Commit 25869601 authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/hf_vlms.py
parents 56f40c53 c1d8795d
......@@ -8,6 +8,7 @@ build
dist
*.egg-info
venv
.venv/
.vscode/
temp
__pycache__
......
......@@ -2,7 +2,7 @@
exclude: ^tests/testdata/
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v4.6.0
hooks:
- id: check-added-large-files
- id: check-ast
......@@ -29,7 +29,7 @@ repos:
- id: mixed-line-ending
args: [--fix=lf]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.8
rev: v0.6.8
hooks:
# Run the linter.
- id: ruff
......
......@@ -54,7 +54,7 @@ The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's pop
To install the `lm-eval` package from the github repository, run:
```bash
git clone https://github.com/EleutherAI/lm-evaluation-harness
git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
```
......
......@@ -253,9 +253,7 @@
"output_type": "display_data"
}
],
"source": [
"from lm_eval import api"
]
"source": []
},
{
"cell_type": "markdown",
......@@ -288,7 +286,7 @@
},
"outputs": [],
"source": [
"YAML_boolq_string = '''\n",
"YAML_boolq_string = \"\"\"\n",
"task: demo_boolq\n",
"dataset_path: super_glue\n",
"dataset_name: boolq\n",
......@@ -302,8 +300,8 @@
"doc_to_decontamination_query: passage\n",
"metric_list:\n",
" - metric: acc\n",
"'''\n",
"with open('boolq.yaml', 'w') as f:\n",
"\"\"\"\n",
"with open(\"boolq.yaml\", \"w\") as f:\n",
" f.write(YAML_boolq_string)"
]
},
......@@ -368,7 +366,7 @@
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_boolq \\\n",
" --limit 10\n"
" --limit 10"
]
},
{
......@@ -394,7 +392,7 @@
},
"outputs": [],
"source": [
"YAML_cola_string = '''\n",
"YAML_cola_string = \"\"\"\n",
"tag: yes_or_no_tasks\n",
"task: demo_cola\n",
"dataset_path: glue\n",
......@@ -409,8 +407,8 @@
"doc_to_decontamination_query: sentence\n",
"metric_list:\n",
" - metric: acc\n",
"'''\n",
"with open('cola.yaml', 'w') as f:\n",
"\"\"\"\n",
"with open(\"cola.yaml\", \"w\") as f:\n",
" f.write(YAML_cola_string)"
]
},
......@@ -471,7 +469,7 @@
" --tasks yes_or_no_tasks \\\n",
" --limit 10 \\\n",
" --output output/yes_or_no_tasks/ \\\n",
" --log_samples\n"
" --log_samples"
]
},
{
......@@ -493,7 +491,7 @@
},
"outputs": [],
"source": [
"YAML_mmlu_geo_string = '''\n",
"YAML_mmlu_geo_string = \"\"\"\n",
"task: demo_mmlu_high_school_geography\n",
"dataset_path: cais/mmlu\n",
"dataset_name: high_school_geography\n",
......@@ -513,9 +511,9 @@
" - metric: acc_norm\n",
" aggregation: mean\n",
" higher_is_better: true\n",
"'''\n",
"with open('mmlu_high_school_geography.yaml', 'w') as f:\n",
" f.write(YAML_mmlu_geo_string)\n"
"\"\"\"\n",
"with open(\"mmlu_high_school_geography.yaml\", \"w\") as f:\n",
" f.write(YAML_mmlu_geo_string)"
]
},
{
......@@ -592,14 +590,14 @@
},
"outputs": [],
"source": [
"YAML_mmlu_geo_string = '''\n",
"YAML_mmlu_geo_string = \"\"\"\n",
"include: mmlu_high_school_geography.yaml\n",
"task: demo_mmlu_high_school_geography_continuation\n",
"doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
"doc_to_choice: \"{{choices}}\"\n",
"'''\n",
"with open('mmlu_high_school_geography_continuation.yaml', 'w') as f:\n",
" f.write(YAML_mmlu_geo_string)\n"
"\"\"\"\n",
"with open(\"mmlu_high_school_geography_continuation.yaml\", \"w\") as f:\n",
" f.write(YAML_mmlu_geo_string)"
]
},
{
......@@ -646,7 +644,7 @@
" --tasks demo_mmlu_high_school_geography_continuation \\\n",
" --limit 10 \\\n",
" --output output/mmlu_high_school_geography_continuation/ \\\n",
" --log_samples\n"
" --log_samples"
]
},
{
......@@ -678,7 +676,11 @@
],
"source": [
"from google.colab import files\n",
"files.view(\"output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\")\n"
"\n",
"\n",
"files.view(\n",
" \"output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\"\n",
")"
]
},
{
......@@ -767,16 +769,16 @@
}
],
"source": [
"YAML_mmlu_geo_string = '''\n",
"YAML_mmlu_geo_string = \"\"\"\n",
"include: mmlu_high_school_geography.yaml\n",
"task: demo_mmlu_high_school_geography_function_prompt\n",
"doc_to_text: !function utils.doc_to_text\n",
"doc_to_choice: \"{{choices}}\"\n",
"'''\n",
"with open('demo_mmlu_high_school_geography_function_prompt.yaml', 'w') as f:\n",
"\"\"\"\n",
"with open(\"demo_mmlu_high_school_geography_function_prompt.yaml\", \"w\") as f:\n",
" f.write(YAML_mmlu_geo_string)\n",
"\n",
"DOC_TO_TEXT = '''\n",
"DOC_TO_TEXT = \"\"\"\n",
"def doc_to_text(x):\n",
" question = x[\"question\"].strip()\n",
" choices = x[\"choices\"]\n",
......@@ -785,8 +787,8 @@
" option_c = choices[2]\n",
" option_d = choices[3]\n",
" return f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. {option_d}\\\\nAnswer:\"\n",
"'''\n",
"with open('utils.py', 'w') as f:\n",
"\"\"\"\n",
"with open(\"utils.py\", \"w\") as f:\n",
" f.write(DOC_TO_TEXT)\n",
"\n",
"!lm_eval \\\n",
......@@ -796,7 +798,7 @@
" --tasks demo_mmlu_high_school_geography_function_prompt \\\n",
" --limit 10 \\\n",
" --output output/demo_mmlu_high_school_geography_function_prompt/ \\\n",
" --log_samples\n"
" --log_samples"
]
},
{
......@@ -814,17 +816,17 @@
"metadata": {},
"outputs": [],
"source": [
"YAML_mmlu_geo_string = '''\n",
"YAML_mmlu_geo_string = \"\"\"\n",
"include: mmlu_high_school_geography.yaml\n",
"task: demo_mmlu_high_school_geography_function_prompt_2\n",
"process_docs: !function utils_process_docs.process_docs\n",
"doc_to_text: \"{{input}}\"\n",
"doc_to_choice: \"{{choices}}\"\n",
"'''\n",
"with open('demo_mmlu_high_school_geography_process_docs.yaml', 'w') as f:\n",
"\"\"\"\n",
"with open(\"demo_mmlu_high_school_geography_process_docs.yaml\", \"w\") as f:\n",
" f.write(YAML_mmlu_geo_string)\n",
"\n",
"DOC_TO_TEXT = '''\n",
"DOC_TO_TEXT = \"\"\"\n",
"def process_docs(dataset):\n",
" def _process_doc(x):\n",
" question = x[\"question\"].strip()\n",
......@@ -837,9 +839,9 @@
" return out_doc\n",
"\n",
" return dataset.map(_process_doc)\n",
"'''\n",
"\"\"\"\n",
"\n",
"with open('utils_process_docs.py', 'w') as f:\n",
"with open(\"utils_process_docs.py\", \"w\") as f:\n",
" f.write(DOC_TO_TEXT)\n",
"\n",
"!lm_eval \\\n",
......@@ -849,7 +851,7 @@
" --tasks demo_mmlu_high_school_geography_function_prompt_2 \\\n",
" --limit 10 \\\n",
" --output output/demo_mmlu_high_school_geography_function_prompt_2/ \\\n",
" --log_samples\n"
" --log_samples"
]
},
{
......
......@@ -68,6 +68,7 @@
"source": [
"import wandb\n",
"\n",
"\n",
"wandb.login()"
]
},
......@@ -130,6 +131,7 @@
"import lm_eval\n",
"from lm_eval.loggers import WandbLogger\n",
"\n",
"\n",
"results = lm_eval.simple_evaluate(\n",
" model=\"hf\",\n",
" model_args=\"pretrained=microsoft/phi-2,trust_remote_code=True\",\n",
......
......@@ -431,7 +431,12 @@ class TemplateLM(LM):
using_default_template = False
# First, handle the cases when the model has a dict of multiple templates
template = self.tokenizer.chat_template or self.tokenizer.default_chat_template
try:
template = (
self.tokenizer.chat_template or self.tokenizer.default_chat_template
)
except AttributeError:
return None
if isinstance(template, dict):
using_default_dict = self.tokenizer.chat_template is None
......
......@@ -57,7 +57,6 @@ class TaskConfig(dict):
task: Optional[str] = None
task_alias: Optional[str] = None
tag: Optional[Union[str, list]] = None
group: Optional[Union[str, list]] = None
# HF dataset options.
# which dataset to use,
# and what splits for what purpose
......@@ -98,18 +97,6 @@ class TaskConfig(dict):
)
def __post_init__(self) -> None:
if self.group is not None:
eval_logger.warning(
"A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information."
)
if self.tag is None:
self.tag = self.group
else:
raise ValueError(
"Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4."
)
if self.generation_kwargs is not None:
if self.output_type != "generate_until":
eval_logger.warning(
......@@ -1511,7 +1498,7 @@ class ConfigurableTask(Task):
# we expect multiple_targets to be a list.
elif self.multiple_target:
gold = list(gold)
elif type(gold) != type(result):
elif type(gold) is not type(result):
# cast gold to the same type as result
gold = type(result)(gold)
......@@ -1594,7 +1581,7 @@ class ConfigurableTask(Task):
f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
f"output_type={self.OUTPUT_TYPE},"
f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
f"num_samples={len(self.eval_docs)})",
f"num_samples={len(self.eval_docs)})"
)
......
......@@ -157,6 +157,9 @@ def simple_evaluate(
seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
torch.manual_seed(torch_random_seed)
if fewshot_random_seed is not None:
seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")
if seed_message:
eval_logger.info(" | ".join(seed_message))
......@@ -276,9 +279,6 @@ def simple_evaluate(
task_obj.set_config(key="num_fewshot", value=0)
# fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
)
adjusted_task_dict[task_name] = task_obj
......@@ -433,10 +433,14 @@ def evaluate(
)
# end multimodality validation check
# Cache the limit arg.
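# (each task resolves its own sample size from the cached user-supplied value,
# so a previously processed task's resolved limit never leaks into the next)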
limit_arg = limit
limits = []
for task_output in eval_tasks:
task: Task = task_output.task
limit = get_sample_size(task, limit)
limit = get_sample_size(task, limit_arg)
limits.append(limit)
task.build_all_requests(
limit=limit,
rank=lm.rank,
......@@ -506,7 +510,7 @@ def evaluate(
WORLD_SIZE = lm.world_size
### Postprocess outputs ###
# TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
for task_output in eval_tasks:
for task_output, limit in zip(eval_tasks, limits):
task = task_output.task
task.apply_filters()
......@@ -655,7 +659,7 @@ def evaluate(
len(task_output.task.eval_docs),
),
}
for task_output in eval_tasks
for task_output, limit in zip(eval_tasks, limits)
},
}
if log_samples:
......
......@@ -73,9 +73,12 @@ class TemplateAPI(TemplateLM):
seed: int = 1234,
max_length: Optional[int] = 2048,
add_bos_token: bool = False,
custom_prefix_token_id=None,
custom_prefix_token_id: Optional[int] = None,
# send the requests as tokens or strings
tokenized_requests=True,
tokenized_requests: bool = True,
trust_remote_code: bool = False,
revision: Optional[str] = "main",
use_fast_tokenizer: bool = True,
**kwargs,
) -> None:
super().__init__()
......@@ -128,7 +131,10 @@ class TemplateAPI(TemplateLM):
import transformers
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
self.tokenizer if self.tokenizer else self.model
self.tokenizer if self.tokenizer else self.model,
trust_remote_code=trust_remote_code,
revision=revision,
use_fast=use_fast_tokenizer,
)
# Not used as the API will handle padding but to mirror the behavior of the HFLM
self.tokenizer = configure_pad_token(self.tokenizer)
......@@ -153,6 +159,9 @@ class TemplateAPI(TemplateLM):
assert isinstance(tokenizer, str), "tokenizer must be a string"
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer,
trust_remote_code=trust_remote_code,
revision=revision,
use_fast=use_fast_tokenizer,
)
@abc.abstractmethod
......
......@@ -26,9 +26,9 @@ class DummyLM(LM):
def generate_until(self, requests, disable_tqdm: bool = False):
res = []
for ctx, _ in tqdm(requests, disable=disable_tqdm):
for request in tqdm(requests, disable=disable_tqdm):
res.append("lol")
assert ctx.strip() != ""
assert request.arguments[0].strip() != ""
return res
......
......@@ -13,6 +13,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import (
Collator,
flatten_image_list,
pad_and_concat,
replace_placeholders,
stop_sequences_criteria,
......@@ -295,6 +296,11 @@ class HFMultimodalLM(HFLM):
images = [img[: self.max_images] for img in images]
if self.rgb:
images = [[img.convert("RGB") for img in sublist] for sublist in images]
# certain models like llava expect a single-level image list even for bs>1, multi-image. TODO: port this over to loglikelihoods
if getattr(self.config, "model_type", "") == "llava":
images = flatten_image_list(images)
try:
encoding = self.processor(
images=images,
......
......@@ -55,7 +55,7 @@ class HFLM(TemplateLM):
def __init__(
self,
pretrained: Union[str, transformers.PreTrainedModel],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
backend: Literal["default", "causal", "seq2seq"] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main",
subfolder: Optional[str] = None,
......@@ -90,7 +90,6 @@ class HFLM(TemplateLM):
**kwargs,
) -> None:
super().__init__()
# optionally: take in an already-initialized transformers.PreTrainedModel
if not isinstance(pretrained, str):
eval_logger.warning(
......@@ -164,7 +163,7 @@ class HFLM(TemplateLM):
trust_remote_code=trust_remote_code,
)
# determine which of 'causal' and 'seq2seq' backends to use
# determine which of 'causal' and 'seq2seq' backends to use for HF models
self._get_backend(
config=self.config, backend=backend, trust_remote_code=trust_remote_code
)
......@@ -287,7 +286,7 @@ class HFLM(TemplateLM):
def _get_accelerate_args(
self,
parallelize: bool = None,
parallelize: Optional[bool] = None,
device_map: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
......@@ -441,31 +440,26 @@ class HFLM(TemplateLM):
def _get_backend(
self,
config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
backend: Literal["default", "causal", "seq2seq"] = "default",
trust_remote_code: Optional[bool] = False,
) -> None:
"""
Helper method during initialization.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
model type to be used.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
**If not calling HFLM.__init__() or HFLM._get_backend() within a subclass of HFLM,
user must set `self.backend` to be either "causal" or "seq2seq" manually!**
"""
# escape hatch: if we're using a subclass that shouldn't follow
# the default _get_backend logic,
# then skip over the method.
# TODO: this seems very much undesirable in some cases--our code in HFLM
# references AutoModelForCausalLM at times to check for equality
if self.AUTO_MODEL_CLASS is not None:
return
assert backend in ["default", "causal", "seq2seq"]
if backend != "default":
# if we've settled on non-default backend, use that manually
if backend == "causal":
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
self.backend = backend
elif backend == "seq2seq":
self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
self.backend = backend
eval_logger.info(
f"Overrode HF model backend type, and using type '{backend}'"
)
......@@ -478,26 +472,32 @@ class HFLM(TemplateLM):
# first check if model type is listed under seq2seq models, since some
# models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
# these special cases should be treated as seq2seq models.
self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
self.backend = "seq2seq"
eval_logger.info(f"Using model type '{backend}'")
elif (
getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
):
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
self.backend = "causal"
eval_logger.info(f"Using model type '{backend}'")
else:
if not trust_remote_code:
eval_logger.warning(
"HF model type is neither marked as CausalLM or Seq2SeqLM. \
This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
"Setting backend to causal"
)
# if model type is neither in HF transformers causal or seq2seq model registries
# then we default to AutoModelForCausalLM
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
# then we default to assuming AutoModelForCausalLM
self.backend = "causal"
eval_logger.info(
f"Model type cannot be determined. Using default model type '{backend}'"
)
assert self.AUTO_MODEL_CLASS in [
transformers.AutoModelForCausalLM,
transformers.AutoModelForSeq2SeqLM,
]
return None
if self.AUTO_MODEL_CLASS is None:
if self.backend == "causal":
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
elif self.backend == "seq2seq":
self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
def _get_config(
self,
......@@ -505,6 +505,7 @@ class HFLM(TemplateLM):
revision: str = "main",
trust_remote_code: bool = False,
) -> None:
"""Return the model config for HuggingFace models"""
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
......@@ -703,7 +704,7 @@ class HFLM(TemplateLM):
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size):
if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
if self.backend == "seq2seq":
length = max(max_context_enc, max_cont_enc)
batched_conts = torch.ones(
(batch_size, length), device=self.device
......@@ -754,7 +755,7 @@ class HFLM(TemplateLM):
# by default for CausalLM - false or self.add_bos_token is set
if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
special_tokens_kwargs = {
"add_special_tokens": False or self.add_bos_token
}
......@@ -782,7 +783,7 @@ class HFLM(TemplateLM):
self.tokenizer.padding_side = padding_side
add_special_tokens = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
encoding = self.tokenizer(
......@@ -860,14 +861,14 @@ class HFLM(TemplateLM):
def _select_cont_toks(
self, logits: torch.Tensor, contlen: int = None, inplen: int = None
) -> torch.Tensor:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
assert (
contlen and inplen
), "Must pass input len and cont. len to select scored logits for causal LM"
# discard right-padding.
# also discard the input/context tokens. we'll only score continuations.
logits = logits[inplen - contlen : inplen]
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
elif self.backend == "seq2seq":
assert (
contlen and not inplen
), "Selecting scored logits for Seq2SeqLM requires only cont. len"
......@@ -990,8 +991,7 @@ class HFLM(TemplateLM):
requests,
sort_fn=_collate,
group_by="contexts"
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
and self.logits_cache
if self.backend == "causal" and self.logits_cache
else None,
group_fn=_lookup_one_token_cont,
)
......@@ -1048,14 +1048,14 @@ class HFLM(TemplateLM):
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
device=self.device,
)
(inplen,) = inp.shape
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
elif self.backend == "seq2seq":
inp = torch.tensor(
(context_enc)[-self.max_length :],
dtype=torch.long,
......@@ -1095,11 +1095,11 @@ class HFLM(TemplateLM):
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
batched_inps = pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
elif self.backend == "seq2seq":
# TODO: left-pad encoder inps and mask?
batched_inps = pad_and_concat(
padding_len_inp, inps
......@@ -1130,7 +1130,7 @@ class HFLM(TemplateLM):
# from prompt/prefix tuning tokens, if applicable
ctx_len = (
inplen + (logits.shape[0] - padding_len_inp)
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
if self.backend == "causal"
else None
)
logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
......@@ -1265,10 +1265,10 @@ class HFLM(TemplateLM):
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
elif self.backend == "seq2seq":
# max len for inputs = encoder's whole max_length
max_ctx_len = self.max_length
......@@ -1295,7 +1295,7 @@ class HFLM(TemplateLM):
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks)
......
import copy
import json
import logging
import subprocess
from collections import defaultdict
from typing import List, Optional, Union
......@@ -33,54 +31,6 @@ except ImportError:
logger = logging.getLogger(__name__)
def get_nc_count() -> Union[int, None]:
"""Returns the number of neuron cores on the current instance."""
try:
cmd = "neuron-ls --json-output"
result = subprocess.run(cmd, shell=True, capture_output=True)
print(f"inferring nc_count from `neuron-ls` {result.stdout}")
json_output = json.loads(result.stdout)
count = sum([x["nc_count"] for x in json_output])
print(f"nc_count={count}")
return count
except Exception:
return None
def wrap_constant_batch_size(func):
def _decorator(self, input_ids):
"""input_ids a 2D array with batch_size on dim=0
makes sure the func runs with self.batch_size
"""
# access a from TestSample
batch_size = input_ids.shape[0]
if batch_size < self.batch_size:
# handle the event of input_ids.shape[0] != batch_size
# Neuron cores expect constant batch_size
input_ids = torch.concat(
(
input_ids,
# add missing_batch_size dummy
torch.zeros(
[self.batch_size - batch_size, *input_ids.size()[1:]],
dtype=input_ids.dtype,
device=input_ids.device,
),
),
dim=0,
)
elif batch_size > self.batch_size:
raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
)
# return the forward pass that requires constant batch size
return func(self, input_ids)[:batch_size]
return _decorator
class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
"""NeuronModelForCausalLM with `stopping_criteria` in `generate`"""
......@@ -146,7 +96,7 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
)
elif batch_size < self.batch_size:
elif batch_size < self.batch_size and not self.continuous_batching:
logger.warning(
"Inputs will be padded to match the model static batch size. This will increase latency."
)
......@@ -158,8 +108,6 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
if attention_mask is not None:
padding = torch.zeros(padding_shape, dtype=torch.int64)
padded_attention_mask = torch.cat([attention_mask, padding])
# Drop the current generation context and clear the Key/Value cache
self.reset_generation()
output_ids = self.generate_tokens(
padded_input_ids,
......@@ -179,8 +127,6 @@ class NEURON_HF(TemplateLM):
Tested with neuron 2.17.0
"""
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
......@@ -203,7 +149,7 @@ class NEURON_HF(TemplateLM):
"please install neuron via pip install transformers-neuron ",
"also make sure you are running on an AWS inf2 instance",
)
if version.parse(optimum_neuron_version) != version.parse("0.0.17"):
if version.parse(optimum_neuron_version) != version.parse("0.0.24"):
logger.warning(
'`optimum-neuron` model requires `pip install "optimum[neuronx]==0.0.24"` '
"preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
......@@ -217,35 +163,16 @@ class NEURON_HF(TemplateLM):
self.batch_size_per_gpu = int(batch_size)
batch_size = int(batch_size)
if tp_degree is None:
# execute `neuron-ls --json-output | jq '.[0].nc_count'``
# to get the number of neuron cores on your instance
tp_degree = get_nc_count()
assert isinstance(tp_degree, int), (
f"model_args must include tp_degree. tp_degree must be set to an integer,"
f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
"Set it to number of neuron cores on your instance."
" For inf2.xlarge and inf2.8xlarge, set it to `2`."
" For inf2.24xlarge, set it to `12`."
" For inf2.48xlarge, set it to `24`."
)
revision = str(revision) # cast to string if not already one
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
torch_dtype = lm_eval.models.utils.get_dtype(dtype)
assert torch_dtype in [
torch.float16,
torch.bfloat16,
], "Only float16 and bfloat16 are supported"
revision = str(revision) # cast to string if not already one
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
......@@ -254,7 +181,20 @@ class NEURON_HF(TemplateLM):
use_fast=use_fast_tokenizer,
)
# Neuron specific code
neuron_config = getattr(self._config, "neuron", None)
if neuron_config is None:
# Check export parameters
if tp_degree is not None:
assert isinstance(tp_degree, int), (
f"tp_degree must be set to an integer,"
f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
"Set it to a number lower than the number of neuron cores on your instance."
" For inf2.xlarge and inf2.8xlarge, set it to `2`."
" For inf2.24xlarge, set it <= `12`."
" For inf2.48xlarge, set it <= `24`."
)
torch_dtype = lm_eval.models.utils.get_dtype(dtype)
if torch_dtype == torch.float16:
self.amp_dtype = "f16"
elif torch_dtype == torch.bfloat16:
......@@ -262,28 +202,37 @@ class NEURON_HF(TemplateLM):
elif torch_dtype == torch.float32:
self.amp_dtype = "f32"
else:
raise NotImplementedError("Only float16 and bfloat16 are implemented.")
compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype}
input_shapes = {
"batch_size": batch_size,
"sequence_length": self._DEFAULT_MAX_LENGTH,
}
raise NotImplementedError(
"Only float16/bfloat16/float32 are supported."
)
print(f"{'='*20} \n exporting model to neuron")
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
low_cpu_mem_usage=low_cpu_mem_usage,
export=True,
batch_size=batch_size,
num_cores=tp_degree,
auto_cast_type=self.amp_dtype,
sequence_length=max_length,
)
neuron_config = self.model.config.neuron
print(
f"{'='*20} \n loading model to neuron with"
f" {compiler_args}, {input_shapes}..."
f"SUCCESS: neuron model exported with config {neuron_config}. \n {'='*20}"
)
else:
print(
f"{'='*20} \n loading neuron model with config" f" {neuron_config}..."
)
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
low_cpu_mem_usage=low_cpu_mem_usage,
export=True,
**compiler_args,
**input_shapes,
)
print(f"SUCCESS: neuron model compiled. \n {'='*20}")
print(f"SUCCESS: neuron model loaded. \n {'='*20}")
self.truncation = truncation
......@@ -291,8 +240,6 @@ class NEURON_HF(TemplateLM):
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.add_bos_token = add_bos_token
self._max_length = max_length
self.batch_schedule = 1
self.batch_sizes = {}
......@@ -313,17 +260,7 @@ class NEURON_HF(TemplateLM):
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
if hasattr(self.model.config, attr):
return getattr(self.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
return self.model.max_length
@property
def max_gen_toks(self) -> int:
......@@ -391,34 +328,6 @@ class NEURON_HF(TemplateLM):
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
@wrap_constant_batch_size
def _model_call(self, input_ids: torch.Tensor):
"""
get logits for the entire sequence
:param input_ids: torch.Tensor
A torch tensor of shape [batch, sequence_cont]
the size of sequence may vary from call to call
:return
A torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model's decoder-lm head
"""
_, sequence_length = input_ids.shape
with torch.inference_mode():
cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
input_ids_split = input_ids.split(1, dim=1)
return torch.concat(
[
self.model.forward(
input_ids=input_id, cache_ids=cache_id, return_dict=False
)[0]
for input_id, cache_id in zip(input_ids_split, cache_ids)
],
dim=1,
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
# we require users to pass do_sample=True explicitly
# for non-greedy gen. This should be reevaluated when considering beam search.
......@@ -580,15 +489,41 @@ class NEURON_HF(TemplateLM):
cont_toks_list.append(continuation_enc)
inplens.append(inplen)
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
# Add dummy inputs up to the model static batch size
if len(inps) < self.batch_size:
inps = inps + [
torch.zeros_like(inps[0]),
] * (self.batch_size - len(inps))
masks = [torch.ones_like(inp) for inp in inps]
batched_inps = lm_eval.models.utils.pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
batched_masks = lm_eval.models.utils.pad_and_concat(
padding_len_inp, masks, padding_side="right"
)
if self.model.model.neuron_config.output_all_logits:
inputs = self.model.prepare_inputs_for_prefill(
batched_inps, batched_masks
)
multi_logits = F.log_softmax(
self._model_call(batched_inps, **call_kwargs), dim=-1
self.model.forward(**inputs).logits, dim=-1
) # [batch, padding_length (inp or cont), vocab]
else:
# The model will only return the logits for the last input token, so we need
# to iterate over inputs to accumulate logits.
# To speed things up we use the KV cache as we would do when generating.
inputs = self.model.prepare_inputs_for_prefill(
batched_inps[:, :1], batched_masks[:, :1]
)
outputs = [self.model.forward(**inputs).logits]
for i in range(1, padding_len_inp):
inputs = self.model.prepare_inputs_for_decode(
batched_inps[:, : i + 1], batched_masks[:, : i + 1]
)
outputs.append(self.model.forward(**inputs).logits)
multi_logits = F.log_softmax(torch.concat(outputs, dim=1), dim=-1)
for (cache_key, _, _), logits, inplen, cont_toks in zip(
chunk, multi_logits, inplens, cont_toks_list
......
......@@ -69,11 +69,11 @@ class LocalCompletionsAPI(TemplateAPI):
for choice, ctxlen in zip(out["choices"], ctxlens):
assert ctxlen > 0, "Context length must be greater than 0"
logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1])
tokens = choice["logprobs"]["token_logprobs"][ctxlen:-1]
tokens_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:-1]
top_logprobs = choice["logprobs"]["top_logprobs"][ctxlen:-1]
is_greedy = True
for tok, top in zip(tokens, top_logprobs):
if tok != max(top, key=top.get):
for tok, top in zip(tokens_logprobs, top_logprobs):
if tok != max(top.values()):
is_greedy = False
break
res.append((logprobs, is_greedy))
......@@ -190,14 +190,18 @@ class OpenAICompletionsAPI(LocalCompletionsAPI):
key = os.environ.get("OPENAI_API_KEY", None)
if key is None:
raise ValueError(
"API key not found. Please set the OPENAI_API_KEY environment variable."
"API key not found. Please set the `OPENAI_API_KEY` environment variable."
)
return key
def loglikelihood(self, requests, **kwargs):
assert (
self.model != "gpt-3.5-turbo"
), "Loglikelihood is not supported for gpt-3.5-turbo"
self.model
in [
"babbage-002",
"davinci-002",
]
), f"Prompt loglikelihoods are only supported by OpenAI's API for {['babbage-002', 'davinci-002']}."
return super().loglikelihood(requests, **kwargs)
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
......@@ -226,6 +230,11 @@ class OpenAIChatCompletion(LocalChatCompletion):
key = os.environ.get("OPENAI_API_KEY", None)
if key is None:
raise ValueError(
"API key not found. Please set the OPENAI_API_KEY environment variable."
"API key not found. Please set the `OPENAI_API_KEY` environment variable."
)
return key
def loglikelihood(self, requests, **kwargs):
raise NotImplementedError(
"Loglikelihood (and therefore `multiple_choice`-type tasks) is not supported for chat completions as OpenAI does not provide prompt logprobs. See https://github.com/EleutherAI/lm-evaluation-harness/issues/942#issuecomment-1777836312 or https://github.com/EleutherAI/lm-evaluation-harness/issues/1196 for more background on this limitation."
)
......@@ -698,3 +698,14 @@ def replace_placeholders(
# Add the last part of the string
result.append(parts[-1])
return "".join(result)
def flatten_image_list(images: List[List]):
"""
Takes in a list of lists of images, and returns a single list of all images in order.
Used for some multimodal models like Llava-1.5 which expects this flattened-list format for its image processor.
:param images: A list of lists of PIL images.
:return: a list of PIL images, via concatenating all the sub-lists in order.
"""
return [image for image_list in images for image in image_list]
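# e.g. (hypothetical PIL images): flatten_image_list([[a, b], [c]]) -> [a, b, c]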
......@@ -7,9 +7,9 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, undistribute
from lm_eval.models.utils import Collator, replace_placeholders, undistribute
from lm_eval.models.vllm_causallms import VLLM
from lm_eval.utils import simple_parse_args_string
from lm_eval.utils import eval_logger
try:
......@@ -36,10 +36,11 @@ class VLLM_VLM(VLLM):
interleave: bool = True,
# TODO<baber>: handle max_images and limit_mm_per_prompt better
max_images: int = 999,
limit_mm_per_prompt: str = "image=1",
**kwargs,
):
kwargs["limit_mm_per_prompt"] = simple_parse_args_string(limit_mm_per_prompt)
if max_images != 999:
kwargs["limit_mm_per_prompt"] = {"image": max_images}
eval_logger.info(f"Setting limit_mm_per_prompt[image] to {max_images}")
super().__init__(
pretrained=pretrained,
trust_remote_code=trust_remote_code,
......@@ -63,6 +64,17 @@ class VLLM_VLM(VLLM):
truncation: bool = False,
):
images = [img[: self.max_images] for img in images]
# TODO<baber>: is the default placeholder always <image>?
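# Note: source and target placeholders are identical in the call below, so it
# effectively just caps each prompt at self.max_images image placeholders.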
if self.chat_applied is False:
strings = [
replace_placeholders(
string,
DEFAULT_IMAGE_PLACEHOLDER,
DEFAULT_IMAGE_PLACEHOLDER,
self.max_images,
)
for string in strings
]
outputs = []
for x, i in zip(strings, images):
......
......@@ -18,6 +18,7 @@
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
......@@ -25,6 +26,7 @@
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
......@@ -42,6 +44,7 @@
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
......@@ -86,6 +89,7 @@
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
......@@ -95,6 +99,7 @@
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
......@@ -107,6 +112,7 @@
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
......
......@@ -40,7 +40,11 @@ class TaskManager:
[x for x in self._all_tasks if self._task_index[x]["type"] == "group"]
)
self._all_subtasks = sorted(
[x for x in self._all_tasks if self._task_index[x]["type"] == "task"]
[
x
for x in self._all_tasks
if self._task_index[x]["type"] in ["task", "python_task"]
]
)
self._all_tags = sorted(
[x for x in self._all_tasks if self._task_index[x]["type"] == "tag"]
......@@ -271,7 +275,7 @@ class TaskManager:
task_object = config["class"]()
if isinstance(task_object, ConfigurableTask):
# very scuffed: set task name here. TODO: fixme?
task_object.config.task = config["task"]
task_object.config.task = task
else:
task_object = ConfigurableTask(config=config)
......@@ -436,6 +440,30 @@ class TaskManager:
:return
Dictionary of task names as key and task metadata
"""
def _populate_tags_and_groups(config, task, tasks_and_groups, print_info):
# TODO: remove group in next release
if "tag" in config:
attr_list = config["tag"]
if isinstance(attr_list, str):
attr_list = [attr_list]
for tag in attr_list:
if tag not in tasks_and_groups:
tasks_and_groups[tag] = {
"type": "tag",
"task": [task],
"yaml_path": -1,
}
elif tasks_and_groups[tag]["type"] != "tag":
self.logger.info(
f"The tag '{tag}' is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call."
)
break
else:
tasks_and_groups[tag]["task"].append(task)
# TODO: remove group in next release
print_info = True
ignore_dirs = [
......@@ -451,10 +479,14 @@ class TaskManager:
config = utils.load_yaml_config(yaml_path, mode="simple")
if self._config_is_python_task(config):
# This is a python class config
tasks_and_groups[config["task"]] = {
task = config["task"]
tasks_and_groups[task] = {
"type": "python_task",
"yaml_path": yaml_path,
}
_populate_tags_and_groups(
config, task, tasks_and_groups, print_info
)
elif self._config_is_group(config):
# This is a group config
tasks_and_groups[config["group"]] = {
......@@ -483,41 +515,9 @@ class TaskManager:
"type": "task",
"yaml_path": yaml_path,
}
# TODO: remove group in next release
for attr in ["tag", "group"]:
if attr in config:
if attr == "group" and print_info:
self.logger.info(
"`group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. "
"The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. "
"`group`s which aggregate across subtasks must be only defined in a separate group config file, "
"which will be the official way to create groups that support cross-task aggregation as in `mmlu`. "
"Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs "
"for more information."
)
print_info = False
# attr = "tag"
attr_list = config[attr]
if isinstance(attr_list, str):
attr_list = [attr_list]
for tag in attr_list:
if tag not in tasks_and_groups:
tasks_and_groups[tag] = {
"type": "tag",
"task": [task],
"yaml_path": -1,
}
elif tasks_and_groups[tag]["type"] != "tag":
self.logger.info(
f"The tag {tag} is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call."
_populate_tags_and_groups(
config, task, tasks_and_groups, print_info
)
break
else:
tasks_and_groups[tag]["task"].append(task)
else:
self.logger.debug(f"File {f} in {root} could not be loaded")
......
# BasqueBench
### Paper
BasqueBench is a benchmark for evaluating language models on Basque tasks; that is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench combines pre-existing, open datasets with datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon.
The new evaluation datasets included in BasqueBench are:
| Task | Category | Homepage |
|:-------------:|:-----:|:-----:|
| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/wnli-eu |
| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
The datasets included in BasqueBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
| Belebele_eu | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele |
| EusExams | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusExams |
| EusProficiency | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusProficiency |
| EusReading | Reading Comprehension | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusReading |
| EusTrivia | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusTrivia |
| FLORES_eu | Translation | [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) | https://huggingface.co/datasets/facebook/flores |
| QNLIeu | Natural Language Inference | [BasqueGLUE: A Natural Language Understanding Benchmark for Basque](https://aclanthology.org/2022.lrec-1.172/) | https://huggingface.co/datasets/orai-nlp/basqueGLUE |
| XNLIeu | Natural Language Inference | [XNLIeu: a dataset for cross-lingual NLI in Basque](https://arxiv.org/abs/2404.06996) | https://huggingface.co/datasets/HiTZ/xnli-eu |
| XStoryCloze_eu | Commonsense Reasoning | [Few-shot Learning with Multilingual Generative Language Models](https://aclanthology.org/2022.emnlp-main.616/) | https://huggingface.co/datasets/juletxara/xstory_cloze |
### Citation
Paper for BasqueBench coming soon.
### Groups and Tasks
#### Groups
- `basque_bench`: All tasks included in BasqueBench.
- `flores_eu`: All FLORES translation tasks from or to Basque.
#### Tasks
The following tasks evaluate models on the BasqueBench datasets using various scoring methods; an example invocation follows the task lists below.
- `belebele_eus_Latn`
- `eus_exams_eu`
- `eus_proficiency`
- `eus_reading`
- `eus_trivia`
- `flores_eu`
- `flores_eu-ca`
- `flores_eu-de`
- `flores_eu-en`
- `flores_eu-es`
- `flores_eu-fr`
- `flores_eu-gl`
- `flores_eu-it`
- `flores_eu-pt`
- `flores_ca-eu`
- `flores_de-eu`
- `flores_en-eu`
- `flores_es-eu`
- `flores_fr-eu`
- `flores_gl-eu`
- `flores_it-eu`
- `flores_pt-eu`
- `mgsm_direct_eu`
- `mgsm_native_cot_eu`
- `qnlieu`
- `wnli_eu`
- `xcopa_eu`
- `xnli_eu`
- `xnli_eu_native`
- `xstorycloze_eu`
Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. These are:
- `belebele_eus_Latn`: Belebele Basque
- `qnlieu`: From BasqueGLUE
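As with other benchmarks in the harness, the whole group can be run with a single CLI call. A minimal sketch (the model shown is just an illustrative choice):

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-2.8b \
    --tasks basque_bench
```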
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
* [ ] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: basque_bench
task:
- belebele_eus_Latn
- xstorycloze_eu
- flores_eu
- eus_reading
- eus_proficiency
- eus_trivia
- eus_exams_eu
- qnlieu
- xnli_eu
- xnli_eu_native
- wnli_eu
- xcopa_eu
- mgsm_direct_eu
- mgsm_native_cot_eu
metadata:
version: 1.0