"A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information."
)
if self.tag is None:
    self.tag = self.group
else:
    raise ValueError(
        "Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4."
    )
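
As a hedged illustration of the migration this check enforces, the sketch below contrasts a task YAML that keeps a non-aggregating `tag` shortcut with a standalone group config that performs aggregation. All task names and file contents are illustrative, and the `aggregate_metric_list` keys are an assumption about the group-config schema rather than something quoted from the repository.

```python
# Sketch only: parse two illustrative YAML snippets with PyYAML.
import yaml

# A task YAML keeps a lightweight, non-aggregating shortcut via `tag`:
task_cfg = yaml.safe_load("""
task: my_subtask_a
tag: my_shortcut            # replaces the deprecated in-task `group` key
output_type: generate_until
""")

# Cross-task aggregation now lives in a separate group config file:
group_cfg = yaml.safe_load("""
group: my_group
task:
  - my_subtask_a
  - my_subtask_b
aggregate_metric_list:      # assumed schema for aggregate scoring
  - metric: acc
    weight_by_size: true
""")

print(task_cfg["tag"], group_cfg["group"])
```
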
if self.generation_kwargs is not None:
    if self.output_type != "generate_until":
        eval_logger.warning(
...
@@ -1511,7 +1498,7 @@ class ConfigurableTask(Task):
         # we expect multiple_targets to be a list.
         elif self.multiple_target:
             gold = list(gold)
-        elif type(gold) != type(result):
+        elif type(gold) is not type(result):
             # cast gold to the same type as result
             gold = type(result)(gold)
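
A minimal, self-contained illustration (editorial, not repository code) of why the gold target is cast to the result's type before comparison:

```python
# If the reference answer is an int but the model's answer is a string,
# an exact-match check would always fail without the cast.
gold = 5
result = "5"

if type(gold) is not type(result):
    gold = type(result)(gold)  # 5 -> "5"

assert gold == result
```
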
...
@@ -1594,7 +1581,7 @@ class ConfigurableTask(Task):
...
@@ -226,6 +230,11 @@ class OpenAIChatCompletion(LocalChatCompletion):
         key = os.environ.get("OPENAI_API_KEY", None)
         if key is None:
             raise ValueError(
-                "API key not found. Please set the OPENAI_API_KEY environment variable."
+                "API key not found. Please set the `OPENAI_API_KEY` environment variable."
             )
         return key
+
+    def loglikelihood(self, requests, **kwargs):
+        raise NotImplementedError(
+            "Loglikelihood (and therefore `multiple_choice`-type tasks) is not supported for chat completions as OpenAI does not provide prompt logprobs. See https://github.com/EleutherAI/lm-evaluation-harness/issues/942#issuecomment-1777836312 or https://github.com/EleutherAI/lm-evaluation-harness/issues/1196 for more background on this limitation."
+        )
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
...
@@ -25,6 +26,7 @@
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
...
@@ -42,6 +44,7 @@
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
...
@@ -86,6 +89,7 @@
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
...
@@ -95,6 +99,7 @@
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
...
@@ -107,6 +112,7 @@
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
"`group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. "
"The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. "
"`group`s which aggregate across subtasks must be only defined in a separate group config file, "
"which will be the official way to create groups that support cross-task aggregation as in `mmlu`. "
"Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs "
"for more information."
)
print_info = False
# attr = "tag"

attr_list = config[attr]
if isinstance(attr_list, str):
    attr_list = [attr_list]

for tag in attr_list:
    if tag not in tasks_and_groups:
        tasks_and_groups[tag] = {
            "type": "tag",
            "task": [task],
            "yaml_path": -1,
        }
    elif tasks_and_groups[tag]["type"] != "tag":
        self.logger.info(
            f"The tag {tag} is already registered as a group, this tag will not be registered. "
            "This may affect tasks you want to call."
        )
        break
    else:
        tasks_and_groups[tag]["task"].append(task)
else:
    self.logger.debug(f"File {f} in {root} could not be loaded")
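
As a hypothetical illustration of the index this loop builds (names and paths below are made up), two task YAMLs that share a `tag` end up listed under a single registry entry, while ordinary tasks keep their own `yaml_path`:

```python
# Hypothetical contents of `tasks_and_groups` after indexing two tagged tasks.
tasks_and_groups = {
    "my_shortcut": {
        "type": "tag",
        "task": ["my_subtask_a", "my_subtask_b"],
        "yaml_path": -1,  # tags have no config file of their own
    },
    "my_subtask_a": {
        "type": "task",
        "yaml_path": "/path/to/my_subtask_a.yaml",
    },
}
print(tasks_and_groups["my_shortcut"]["task"])
```
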
BasqueBench is a benchmark for evaluating language models on Basque tasks. That is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon.
The new evaluation datasets included in BasqueBench are:
| Task | Category | Homepage |
|:-------------:|:-----:|:-----:|
| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/wnli-eu |
The datasets included in BasqueBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
| Belebele_eu | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele |
| EusExams | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusExams |
| EusProficiency | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusProficiency |
| EusReading | Reading Comprehension | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusReading |
| EusTrivia | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusTrivia |
| FLORES_eu | Translation | [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) | https://huggingface.co/datasets/facebook/flores |
| QNLIeu | Natural Language Inference | [BasqueGLUE: A Natural Language Understanding Benchmark for Basque](https://aclanthology.org/2022.lrec-1.172/) | https://huggingface.co/datasets/orai-nlp/basqueGLUE |
| XNLIeu | Natural Language Inference | [XNLIeu: a dataset for cross-lingual NLI in Basque](https://arxiv.org/abs/2404.06996) | https://huggingface.co/datasets/HiTZ/xnli-eu |
| XStoryCloze_eu | Commonsense Reasoning | [Few-shot Learning with Multilingual Generative Language Models](https://aclanthology.org/2022.emnlp-main.616/) | https://huggingface.co/datasets/juletxara/xstory_cloze |
### Citation
Paper for BasqueBench coming soon.
### Groups and Tasks
#### Groups
- `basque_bench`: All tasks included in BasqueBench.
- `flores_eu`: All FLORES translation tasks from or to Basque.
#### Tasks
The following tasks evaluate models on BasqueBench datasets using various scoring methods; a usage sketch follows the lists below.
- `belebele_eus_Latn`
- `eus_exams_eu`
- `eus_proficiency`
- `eus_reading`
- `eus_trivia`
- `flores_eu`
- `flores_eu-ca`
- `flores_eu-de`
- `flores_eu-en`
- `flores_eu-es`
- `flores_eu-fr`
- `flores_eu-gl`
- `flores_eu-it`
- `flores_eu-pt`
- `flores_ca-eu`
- `flores_de-eu`
- `flores_en-eu`
- `flores_es-eu`
- `flores_fr-eu`
- `flores_gl-eu`
- `flores_it-eu`
- `flores_pt-eu`
- `mgsm_direct_eu`
- `mgsm_native_cot_eu`
- `qnlieu`
- `wnli_eu`
- `xcopa_eu`
- `xnli_eu`
- `xnli_eu_native`
- `xstorycloze_eu`
Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. These are:
- `belebele_eus_Latn`: Belebele Basque
- `qnlieu`: From BasqueGLUE
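
A hedged usage sketch for running the whole group through the lm-eval Python API; the checkpoint name and batch size are placeholders rather than recommendations:

```python
# Sketch: evaluate a Hugging Face model on the `basque_bench` group.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-basque-model",  # placeholder checkpoint
    tasks=["basque_bench"],
    batch_size=8,
)
print(results["results"])
```
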
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
* [ ] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?