Commit bf11ac93 authored by Baber

Merge branch 'main' into llama

parents 83b1c564 ade01428
import copy
+import logging
from typing import Dict, List, Optional, Tuple, Union
import torch
@@ -7,7 +8,6 @@ import transformers
from tqdm import tqdm
from transformers import BatchEncoding
-from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
@@ -24,7 +24,7 @@ from lm_eval.models.utils import (
DEFAULT_IMAGE_PLACEHOLDER = "<image>"
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("hf-multimodal")
...
import copy
+import logging
import os
from datetime import timedelta
from pathlib import Path
@@ -39,7 +40,7 @@ from lm_eval.models.utils import (
)
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("hf-auto", "hf", "huggingface")
...
import copy
import json
+import logging
import os
from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr
-from lm_eval.utils import eval_logger, simple_parse_args_string
+from lm_eval.utils import simple_parse_args_string
+eval_logger = logging.getLogger(__name__)
class LogLikelihoodResult(NamedTuple):
...
@@ -13,6 +13,7 @@
# limitations under the License.
import importlib
+import logging
import pathlib
from copy import deepcopy
from typing import List, Literal
@@ -27,13 +28,15 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator
from lm_eval.utils import (
-    eval_logger,
    get_rolling_token_windows,
    make_disjoint_window,
    simple_parse_args_string,
)
+eval_logger = logging.getLogger(__name__)
def _patch_pretrained_cfg(
    pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
):
...
import copy
+import logging
from typing import List, Optional, Tuple, Union
import numpy
@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("sparseml")
...
+import logging
import os
from functools import cached_property
from operator import itemgetter
@@ -6,7 +7,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI
from lm_eval.models.utils import handle_stop_sequences
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("local-completions")
@@ -288,4 +291,6 @@ class OpenAIChatCompletion(LocalChatCompletion):
        if "o1" in self.model:
            output.pop("stop")
            output["temperature"] = 1
+        elif "o3" in self.model:
+            output.pop("temperature")
        return output
+import logging
from importlib.util import find_spec
-from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import get_dtype
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("ipex")
...
import json
+import logging
from importlib.util import find_spec
from pathlib import Path
-from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("openvino")
...
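All of the hunks above make the same change: each backend module stops importing the shared `lm_eval.utils.eval_logger` and instead creates a per-module logger with `logging.getLogger(__name__)`. A minimal sketch of that pattern in isolation (the module and function names here are illustrative, not from the harness):

```python
import logging

# Per-module logger, as in the hunks above: the module only emits records.
eval_logger = logging.getLogger(__name__)


def load_model(name: str) -> None:
    eval_logger.info("Loading model %s", name)


if __name__ == "__main__":
    # The application (a stand-in for the lm-eval CLI) configures handlers
    # and verbosity once, at the root; every module logger inherits it.
    logging.basicConfig(level=logging.INFO)
    load_model("dummy-model")
```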
import copy
import logging
from importlib.util import find_spec
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
handle_stop_sequences,
)
from lm_eval.utils import (
get_rolling_token_windows,
make_disjoint_window,
)
eval_logger = logging.getLogger(__name__)
try:
import sglang as sgl
except ModuleNotFoundError:
pass
if TYPE_CHECKING:
pass
@register_model("sglang")
class SGLangLM(TemplateLM):
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
pretrained: str,
# batch args from lm-eval interface: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md
batch_size: Union[str, int] = 1,
max_batch_size=None,
max_model_len: Optional[int] = None,
max_gen_toks: int = 256,
add_bos_token: Optional[bool] = False,
########## SGlang native args ##########
# Todo(Jinwei): Include more args of SGLang Engine if needed. Refer to https://docs.sglang.ai/backend/server_arguments.html .
tokenizer_path: Optional[str] = None,
tokenizer_mode: str = "auto",
load_format: str = "auto",
trust_remote_code: bool = True,
dtype: str = "auto",
kv_cache_dtype: str = "auto",
context_length: Optional[int] = None,
device: str = "cuda",
chunked_prefill_size: int = -1,
# Memory and scheduling
mem_fraction_static: Optional[float] = None,
# parallelism
dp_size: int = 1,
tp_size: int = 1,
prefix_token_id: Optional[int] = None,
**kwargs,
):
super().__init__()
if not find_spec("sglang"):
raise ModuleNotFoundError(
"attempted to use 'sglang' LM type, but package `sglang` is not installed. "
"Please install sglang via official document here:https://docs.sglang.ai/start/install.html#install-sglang"
)
assert "cuda" in device or device is None, "SGLang only supports CUDA"
assert context_length is None or max_model_len is None, (
"Either context_length or max_model_len may be provided, but not both"
)
# Initialize your sglang model here
self._max_length = (
max_model_len if max_model_len is not None else context_length
)
self.tensor_parallel_size = int(tp_size)
self.data_parallel_size = int(dp_size)
self.model_args = {
"model_path": pretrained,
"tokenizer_path": tokenizer_path,
"tokenizer_mode": tokenizer_mode,
"load_format": load_format,
"trust_remote_code": trust_remote_code,
"dtype": dtype,
"kv_cache_dtype": kv_cache_dtype,
"device": device,
"mem_fraction_static": mem_fraction_static,
"tp_size": self.tensor_parallel_size,
"dp_size": self.data_parallel_size,
"chunked_prefill_size": chunked_prefill_size,
}
self.model_args.update(kwargs)
self.batch_size = (
"auto"
if isinstance(batch_size, str) and "auto" in batch_size
else int(batch_size)
)
if self.data_parallel_size > 1:
eval_logger.warning(
"Data parallelism will be deprecated in the future version of SGLang. See here: https://docs.sglang.ai/backend/server_arguments.html#data-parallelism ."
)
self.model = sgl.Engine(**self.model_args)
# Todo(Jinwei): check tokenizer and other settings.
self.tokenizer = self.model.tokenizer_manager.tokenizer
self._max_gen_toks = max_gen_toks
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
)
self.custom_prefix_token_id = prefix_token_id
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
adaptive_batch_size = None
if self.batch_size == "auto":
adaptive_batch_size = len(requests)
# First, collect all windows from all requests
all_windows = [] # List of (request_idx, window) tuples
request_window_counts = [] # Track number of windows per request
for req_idx, (string,) in enumerate(
tqdm(
[req.args for req in requests],
disable=(disable_tqdm or (self.rank != 0)),
)
):
rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
map(
make_disjoint_window,
get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.prefix_token_id,
# max_seq_len - (1 for context)
max_seq_len=self.max_length - 1,
context_len=1,
),
)
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
windows = [(None,) + x for x in rolling_token_windows]
# Store windows with their request index
all_windows.extend((req_idx, window) for window in windows)
request_window_counts.append(len(windows))
all_nlls = []
batch_size = adaptive_batch_size or int(self.batch_size)
for i in range(0, len(all_windows), batch_size):
batch = all_windows[i : i + batch_size]
# Extract just the windows for processing, keeping track of request indices
batch_indices, batch_windows = zip(*batch)
batch_nlls = self._loglikelihood_tokens(
requests=batch_windows,
disable_tqdm=False,
)
# Store results with their request indices
all_nlls.extend(zip(batch_indices, batch_nlls))
# Reconstruct per-request loglikelihoods
loglikelihoods = []
current_idx = 0
for window_count in request_window_counts:
# Get all nlls for this request
request_nlls = all_nlls[current_idx : current_idx + window_count]
# Sum up the nlls for this request (discarding is_greedy)
request_total = sum(nll[0] for _, nll in request_nlls)
loglikelihoods.append(request_total)
current_idx += window_count
string = requests[len(loglikelihoods) - 1].args[0]
self.cache_hook.add_partial(
"loglikelihood_rolling", (string,), request_total
)
return loglikelihoods
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = []
# batch tokenize contexts
context, all_gen_kwargs = zip(*(req.args for req in requests))
context_encoding: List[List[int]] = self.tok_encode(
context, add_special_tokens=self.add_bos_token
)
requests = [
((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
]
def _collate_gen(_requests):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
return -len(_requests[0][1]), _requests[0][0]
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(requests, _collate_gen, group_by="gen_kwargs")
chunks = re_ords.get_batched(
n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
)
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests",
)
# for each different set of kwargs, we execute all requests, by batch.
eos = self.tokenizer.decode(self.eot_token_id)
for chunk in chunks:
context_and_encoding, all_gen_kwargs = zip(*chunk)
context, context_encoding = zip(*context_and_encoding)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
context_encoding = [x[-max_ctx_len:] for x in context_encoding]
# perform batched generation
# cont is a list of dic. See here https://github.com/sgl-project/sglang/blob/0a6f18f068e4095fc228e798454e8496c9749214/python/sglang/srt/entrypoints/engine.py#L111 .
cont = self._model_generate(
requests=context_encoding,
generate=True,
max_tokens=max_gen_toks,
stop=until,
**kwargs,
)
# cache generations
for output, context in zip(cont, context):
generated_text = output.get("text", "")
res.append(generated_text)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), generated_text
)
pbar.update(1)
pbar.close()
# reorder all group of results back to original unsorted form
return re_ords.get_original(res)
def _model_generate(
self,
requests: Optional[List[List[int]]] = None,
generate: bool = False,
max_tokens: Optional[int] = None,
stop: Optional[List[str]] = None,
return_logprob: bool = False,
top_logprobs_num: int = 1,
logprob_start_len: int = -1,
**kwargs,
):
# check sglang sampling parameters: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/sampling/sampling_params.py#L21 and https://docs.sglang.ai/references/sampling_params.html.
if generate:
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params = {
"max_new_tokens": max_tokens,
"stop": stop,
}
sampling_params.update(kwargs)
else:
sampling_params = {
"temperature": 0,
"max_new_tokens": 1,
}
sampling_params.update(kwargs)
# Refer to: https://docs.sglang.ai/backend/offline_engine_api.html
outputs = self.model.generate(
input_ids=requests,
sampling_params=sampling_params,
return_logprob=return_logprob,
top_logprobs_num=top_logprobs_num,
logprob_start_len=logprob_start_len,
)
return outputs
@property
def eot_token_id(self):
# Return the EOT (End of Text) token ID
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
return self._max_length
if hasattr(self.model, "tokenizer_manager") and hasattr(
self.model.tokenizer_manager, "context_len"
):
return self.model.tokenizer_manager.context_len
return self._DEFAULT_MAX_LENGTH
@property
def max_gen_toks(self):
# Return the maximum number of tokens for generation
return self._max_gen_toks
def tok_encode(
self,
string: Union[str, List[str]],
left_truncate_len: Optional[int] = None,
add_special_tokens: bool = False,
truncation: bool = False,
) -> Union[List[int], List[List[int]]]:
if not add_special_tokens:
add_special_tokens = self.add_bos_token
encoding: Union[List[List[int]], List[int]] = self.tokenizer(
string,
add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
if not isinstance(string, str):
encoding = [enc[-left_truncate_len:] for enc in encoding]
else:
encoding = encoding[-left_truncate_len:]
return encoding
def tok_decode(self, tokens: List[int]) -> str:
# Decode token ids back to text via the underlying HF tokenizer
return self.tokenizer.decode(tokens)
@property
def tokenizer_name(self) -> str:
"""
Return the name of the model's tokenizer and/or the accompanying chat template.
The returned string is used to cache requests.
Returns:
str: The name of the model's tokenizer and/or chat template.
"""
pass
def chat_template(self, chat_template: Union[bool, str] = False) -> str:
"""
Get the appropriate chat template for the model based on the `chat_template` argument.
This method returns the chat template string to build the prompt from a chat history.
The chat template is saved in the evaluation results for reproducibility.
Boolean arguments should be used with models that have only one chat template,
while string arguments are used with models that have multiple chat templates.
For the reference implementation, see HFLM class in `lm_eval.models.huggingface`.
Args:
chat_template (Union[bool, str]): Specifies whether to apply a chat template:
- If False: Do not apply any chat template.
- If True: Apply the default chat template.
- If str: Apply the specified chat template by name.
Returns:
str: The selected chat template in Jinja format.
"""
pass
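# Illustrative note (not part of this commit): with the HFLM reference
# implementation pointed to above, chat_template=True selects the tokenizer's
# single/default Jinja template, while a string such as chat_template="tool_use"
# would pick a named template when the tokenizer ships several.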
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
chat_templated = self.tokenizer.apply_chat_template(
chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
)
return chat_templated
def _loglikelihood_tokens(
self,
requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
disable_tqdm: bool = False,
) -> List[Tuple[float, bool]]:
res = []
def _collate(x):
toks = x[1] + x[2]
return -len(toks), tuple(toks)
# Reorder requests by length and batch
re_ord = Collator(requests, sort_fn=_collate)
chunks = re_ord.get_batched(
n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
)
pbar = tqdm(
total=len(requests),
disable=disable_tqdm,
desc="Running loglikelihood requests",
)
for chunk in chunks:
inputs = []
ctxlens = []
for cache_key, context_enc, continuation_enc in chunk:
inp = (context_enc + continuation_enc)[-(self.max_length) :]
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length)
)
inputs.append(inp)
ctxlens.append(ctxlen)
outputs = self._model_generate(
requests=inputs,
generate=False,
return_logprob=True,
top_logprobs_num=2,
logprob_start_len=0,
)
for output, ctxlen, (cache_key, _, _), inp in zip(
outputs, ctxlens, chunk, inputs
):
answer = self._parse_logprobs(
tokens=inp,
outputs=output,
ctxlen=ctxlen,
)
res.append(answer)
if cache_key is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
@staticmethod
def _parse_logprobs(tokens: List, outputs, ctxlen: int) -> Tuple[float, bool]:
"""Process logprobs and tokens.
:param tokens: list
Input tokens (potentially left-truncated)
:param outputs:
Contains input_token_logprobs and input_top_logprobs
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: float
Log probabilities of continuation tokens
is_greedy: bool
Whether argmax matches given continuation exactly
"""
# The first entry of input_token_logprobs is None because the model has no previous tokens to condition on.
# [(logprob, token_id, token_text)]
continuation_logprobs_lists = outputs["meta_info"]["input_token_logprobs"]
continuation_logprobs = sum(
logprob for logprob, _, _ in continuation_logprobs_lists[ctxlen:]
)
top_logprobs_lists = outputs["meta_info"]["input_top_logprobs"]
# Determine if is_greedy
is_greedy = True
for token, top_logprobs in zip(tokens[ctxlen:], top_logprobs_lists[ctxlen:]):
if top_logprobs:
top_token = max(top_logprobs, key=lambda x: x[0])[1]
if top_token != token:
is_greedy = False
break
return continuation_logprobs, is_greedy
@staticmethod
def modify_gen_kwargs(kwargs: dict) -> dict:
# sampling_params
do_sample = kwargs.pop("do_sample", None)
if do_sample is False and "temperature" not in kwargs:
eval_logger.debug(
"Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
)
kwargs["temperature"] = 0.0
# hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
kwargs["spaces_between_special_tokens"] = kwargs.get(
"spaces_between_special_tokens", False
)
return kwargs
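For context, the new `sglang` backend registered above plugs into the harness's usual entry points. A minimal, illustrative invocation, assuming the standard `lm_eval.simple_evaluate` API and CLI flags, with a placeholder model name (`sglang` and a CUDA device are required at runtime):

```python
from lm_eval import simple_evaluate

# Rough CLI equivalent (illustrative):
#   lm_eval --model sglang \
#     --model_args pretrained=Qwen/Qwen2.5-0.5B-Instruct,tp_size=1,dp_size=1,dtype=auto \
#     --tasks gsm8k --batch_size auto
results = simple_evaluate(
    model="sglang",
    model_args="pretrained=Qwen/Qwen2.5-0.5B-Instruct,tp_size=1,dp_size=1,dtype=auto",
    tasks=["gsm8k"],
    batch_size="auto",
)
print(results["results"])
```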
@@ -2,6 +2,7 @@ import collections
import fnmatch
import gc
import itertools
+import logging
import time
from functools import wraps
from typing import (
@@ -22,7 +23,8 @@ from typing import (
import torch
import transformers
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
if TYPE_CHECKING:
...
import copy
+import logging
from importlib.metadata import version
from importlib.util import find_spec
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
@@ -17,7 +18,6 @@ from lm_eval.models.utils import (
    undistribute,
)
from lm_eval.utils import (
-    eval_logger,
    get_rolling_token_windows,
    make_disjoint_window,
)
@@ -34,7 +34,7 @@ except ModuleNotFoundError:
if TYPE_CHECKING:
    pass
-eval_logger = eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("vllm")
@@ -75,7 +75,6 @@ class VLLM(TemplateLM):
                "Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
            )
-        assert "cuda" in device or device is None, "vLLM only supports CUDA"
        assert max_length is None or max_model_len is None, (
            "Either max_length or max_model_len may be provided, but not both"
        )
@@ -110,7 +109,7 @@ class VLLM(TemplateLM):
            eval_logger.warning(
                "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
            )
-            self.model_args["worker_use_ray"] = True
+            self.model_args["distributed_executor_backend"] = "ray"
            self.batch_size = "auto"
            eval_logger.info("Manual batching is not compatible with data parallelism.")
@@ -244,15 +243,13 @@
                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
        if self.data_parallel_size > 1:
-            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+            # vLLM hangs if resources are set in ray.remote
            # also seems to only work with decorator and not with ray.remote() fn
            # see https://github.com/vllm-project/vllm/issues/973
-            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-            # but then tensor_parallel breaks
            @ray.remote
            def run_inference_one_model(
                model_args: dict,
-                sampling_params,
+                sampling_params: SamplingParams,
                requests: List[List[int]],
                lora_request: LoRARequest,
            ):
...
import copy
+import logging
from typing import Dict, List, Optional
import transformers
@@ -14,7 +15,9 @@ from lm_eval.models.utils import (
    undistribute,
)
from lm_eval.models.vllm_causallms import VLLM
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
try:
@@ -106,11 +109,9 @@ class VLLM_VLM(VLLM):
                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
        if self.data_parallel_size > 1:
-            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+            # vLLM hangs if resources are set in ray.remote
            # also seems to only work with decorator and not with ray.remote() fn
            # see https://github.com/vllm-project/vllm/issues/973
-            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-            # but then tensor_parallel breaks
            @ray.remote
            def run_inference_one_model(
                model_args: dict, sampling_params, requests: List[List[dict]]
...
import ast
+import logging
import os
from typing import Dict
from lm_eval import utils
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
# prompt category name, and prompt name.
...
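The comment above describes a two-level prompt index; a small illustrative sketch of that shape (category, prompt name, and template text are made-up examples, not the harness's actual registry contents):

```python
from typing import Dict

# category name -> prompt name -> prompt template, as described above
PROMPT_LIBRARY: Dict[str, Dict[str, str]] = {
    "qa-basic": {
        "question-newline-answer": "Question: {{question}}\nAnswer:",
    },
}


def get_prompt(category: str, name: str) -> str:
    return PROMPT_LIBRARY[category][name]


if __name__ == "__main__":
    print(get_prompt("qa-basic", "question-newline-answer"))
```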
@@ -5,137 +5,141 @@
For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
| Task Family | Description | Language(s) |
|-------------|-------------|-------------|
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [AraDICE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [evalita-LLM](evalita-LLM/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) |
| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) |
| [humaneval](humaneval/README.md) | Code generation task that measures functional correctness for synthesizing programs from docstrings. | Python |
| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concepts. | English |
| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese |
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** |
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** | | [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English | | [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English | | [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean | | [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English | | [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English | | [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English | | [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish | | [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese | | [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
| [prost](prost/README.md) | Physical Reasoning about Objects Through Space and Time, testing physical commonsense about everyday objects. | English | | [prost](prost/README.md) | Physical Reasoning about Objects Through Space and Time, testing physical commonsense about everyday objects. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English | | [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English | | [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English | | [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English | | [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | | | realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English | | [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets (MMLU-Pro, AGIEval, and MATH). | English | | [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets (MMLU-Pro, AGIEval, and MATH). | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English | | [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English | | [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish | | [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English | | [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English | | [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English | | [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English | | [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English | | [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English |
| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English | | [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English | | [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese | | [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English | | [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese | | [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English | | [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English | | [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish | | [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English | | [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English | | [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English | | [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English | | [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English | | [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English | | [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish | | [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English | | [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese | | [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese | | [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque | | [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese | | [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | | [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | | [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
...@@ -13,6 +13,8 @@ from lm_eval.evaluator_utils import get_subtask_list ...@@ -13,6 +13,8 @@ from lm_eval.evaluator_utils import get_subtask_list
GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys()) GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())
eval_logger = logging.getLogger(__name__)
class TaskManager: class TaskManager:
"""TaskManager indexes all tasks from the default `lm_eval/tasks/` """TaskManager indexes all tasks from the default `lm_eval/tasks/`
...@@ -22,14 +24,13 @@ class TaskManager: ...@@ -22,14 +24,13 @@ class TaskManager:
def __init__( def __init__(
self, self,
verbosity="INFO", verbosity: Optional[str] = None,
include_path: Optional[Union[str, List]] = None, include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True, include_defaults: bool = True,
) -> None: ) -> None:
self.verbosity = verbosity if verbosity is not None:
utils.setup_logging(verbosity)
self.include_path = include_path self.include_path = include_path
self.logger = utils.eval_logger
self.logger.setLevel(getattr(logging, f"{verbosity}"))
self._task_index = self.initialize_tasks( self._task_index = self.initialize_tasks(
include_path=include_path, include_defaults=include_defaults include_path=include_path, include_defaults=include_defaults
...@@ -456,7 +457,7 @@ class TaskManager: ...@@ -456,7 +457,7 @@ class TaskManager:
"yaml_path": -1, "yaml_path": -1,
} }
elif tasks_and_groups[tag]["type"] != "tag": elif tasks_and_groups[tag]["type"] != "tag":
self.logger.info( eval_logger.info(
f"The tag '{tag}' is already registered as a group, this tag will not be registered. " f"The tag '{tag}' is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call." "This may affect tasks you want to call."
) )
...@@ -519,7 +520,7 @@ class TaskManager: ...@@ -519,7 +520,7 @@ class TaskManager:
config, task, tasks_and_groups, print_info config, task, tasks_and_groups, print_info
) )
else: else:
self.logger.debug(f"File {f} in {root} could not be loaded") eval_logger.debug(f"File {f} in {root} could not be loaded")
return tasks_and_groups return tasks_and_groups
......
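The net effect of this change is that `TaskManager` no longer configures logging itself: `verbosity` is now optional, and when it is supplied the call is forwarded to `utils.setup_logging`, while log messages come from a module-level `logging.getLogger(__name__)`. A minimal sketch of the new behaviour, assuming `lm_eval` is installed and the names are importable as shown in the diff:

```python
import logging

from lm_eval.tasks import TaskManager

# No verbosity argument: TaskManager leaves the logging configuration alone,
# so whatever the host application set up stays in effect.
tm = TaskManager()

# Passing a level is still supported; it is handed to utils.setup_logging,
# which configures logging for the whole lm_eval package.
tm_debug = TaskManager(verbosity="DEBUG")

# Because loggers are now per-module, they can also be tuned individually.
logging.getLogger("lm_eval.tasks").setLevel(logging.WARNING)
```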
...@@ -10,7 +10,7 @@ import yaml ...@@ -10,7 +10,7 @@ import yaml
from tqdm import tqdm from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval") eval_logger = logging.getLogger(__name__)
SUBJECTS = { SUBJECTS = {
......
...@@ -58,3 +58,6 @@ If other tasks on this dataset are already supported: ...@@ -58,3 +58,6 @@ If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted? * [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
### Changelog
version 2.0 (2025-Feb-14): set `target_delimiter` to `""` because the targets already start with a space.
...@@ -8,11 +8,12 @@ validation_split: validation ...@@ -8,11 +8,12 @@ validation_split: validation
test_split: null test_split: null
doc_to_text: "{{context}}" doc_to_text: "{{context}}"
doc_to_target: "{{completion}}" doc_to_target: "{{completion}}"
target_delimiter: ""
metric_list: metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 2.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
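The changelog entry above is easier to see with a toy example. The string that gets scored is, roughly, `doc_to_text` + `target_delimiter` + `doc_to_target`; since the completions in this dataset already begin with a space, the old default delimiter of a single space produced a doubled space. A simplified sketch of that concatenation, not the harness's actual implementation:

```python
# Illustrative only: the scored text is approximately
#   doc_to_text + target_delimiter + doc_to_target
context = "The quick brown fox jumps over the"   # example doc_to_text
completion = " lazy dog"                          # completion already starts with a space

v1 = context + " " + completion   # version 1.0: default delimiter " " -> double space
v2 = context + "" + completion    # version 2.0: target_delimiter "" -> single space

assert v1 == "The quick brown fox jumps over the  lazy dog"
assert v2 == "The quick brown fox jumps over the lazy dog"
```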
...@@ -5,14 +5,16 @@ ...@@ -5,14 +5,16 @@
BasqueBench is a benchmark for evaluating language models on Basque tasks. That is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon. BasqueBench is a benchmark for evaluating language models on Basque tasks. That is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon.
The new evaluation datasets included in BasqueBench are: The new evaluation datasets included in BasqueBench are:
| Task | Category | Homepage | | Task | Category | Homepage |
|:-------------:|:-----:|:-----:| |:--------:|:--------------------------:|:---------------------------------------------:|
| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu | | ARC_eu | Question Answering | https://huggingface.co/datasets/HiTZ/ARC-eu |
| PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu | | MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/wnli-eu | | PAWS_eu | Paraphrasing | https://huggingface.co/datasets/HiTZ/PAWS-eu |
| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu | | PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu |
| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/WNLI-eu |
| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
The datasets included in BasqueBench that have been made public in previous pubications are: The datasets included in BasqueBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage | | Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:| |:-------------:|:-----:|:-------------:|:-----:|
...@@ -28,7 +30,40 @@ The datasets included in BasqueBench that have been made public in previous pubi ...@@ -28,7 +30,40 @@ The datasets included in BasqueBench that have been made public in previous pubi
### Citation ### Citation
Paper for BasqueBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks ### Groups and Tasks
...@@ -40,6 +75,8 @@ Paper for BasqueBench coming soon. ...@@ -40,6 +75,8 @@ Paper for BasqueBench coming soon.
#### Tasks #### Tasks
The following tasks evaluate language models on the BasqueBench datasets using various scoring methods. The following tasks evaluate language models on the BasqueBench datasets using various scoring methods.
- `arc_eu_challenge`
- `arc_eu_easy`
- `belebele_eus_Latn` - `belebele_eus_Latn`
- `eus_exams_eu` - `eus_exams_eu`
- `eus_proficiency` - `eus_proficiency`
...@@ -64,6 +101,7 @@ The following tasks evaluate tasks on BasqueBench dataset using various scoring ...@@ -64,6 +101,7 @@ The following tasks evaluate tasks on BasqueBench dataset using various scoring
- `flores_pt-eu` - `flores_pt-eu`
- `mgsm_direct_eu` - `mgsm_direct_eu`
- `mgsm_native_cot_eu` - `mgsm_native_cot_eu`
- `paws_eu`
- `piqa_eu` - `piqa_eu`
- `qnlieu` - `qnlieu`
- `wnli_eu` - `wnli_eu`
......
include: arc_eu_easy.yaml
task: arc_eu_challenge
dataset_name: ARC-Challenge
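With these configs in place, the new Basque tasks can be run like any other task in the harness. A hedged sketch using the Python entry point; the checkpoint name is a hypothetical placeholder, so substitute any Hugging Face causal LM you want to evaluate:

```python
from lm_eval import evaluator

# "my-org/my-basque-model" is a hypothetical checkpoint; replace it with a real one.
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=my-org/my-basque-model",
    tasks=["arc_eu_easy", "arc_eu_challenge", "paws_eu"],
    num_fewshot=0,
)
print(results["results"])
```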