Merge branch 'main' into convert_gen

# Conflicts: # lm_eval/__main__.py # lm_eval/evaluator.py

Merge branch 'main' into convert_gen
# Conflicts: # lm_eval/__main__.py # lm_eval/evaluator.py
efb46937 · Baber · 7fbf899c · ade01428 · efb46937 · efb46937
Commit efb46937 authored Mar 03, 2025 by Baber
20 changed files
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
 import copy
+import logging
 import os
 from datetime import timedelta
 from pathlib import Path
@@ -39,7 +40,7 @@ from lm_eval.models.utils import (
 )


-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)


 @register_model("hf-auto", "hf", "huggingface")

--- a/lm_eval/models/ibm_watsonx_ai.py
+++ b/lm_eval/models/ibm_watsonx_ai.py
 import copy
 import json
+import logging
 import os
 from functools import lru_cache
 from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
 from lm_eval.models.api_models import JsonChatStr
-from lm_eval.utils import eval_logger, simple_parse_args_string
+from lm_eval.utils import simple_parse_args_string
+
+
+eval_logger = logging.getLogger(__name__)


 class LogLikelihoodResult(NamedTuple):

--- a/lm_eval/models/nemo_lm.py
+++ b/lm_eval/models/nemo_lm.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import importlib
+import logging
 import pathlib
 from copy import deepcopy
 from typing import List, Literal
@@ -27,13 +28,15 @@ from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
 from lm_eval.models.utils import Collator
 from lm_eval.utils import (
-    eval_logger,
    get_rolling_token_windows,
    make_disjoint_window,
    simple_parse_args_string,
 )


+eval_logger = logging.getLogger(__name__)
+
+
 def _patch_pretrained_cfg(
    pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
 ):

--- a/lm_eval/models/neuralmagic.py
+++ b/lm_eval/models/neuralmagic.py
 import copy
+import logging
 from typing import List, Optional, Tuple, Union

 import numpy
@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model
 from lm_eval.models.huggingface import HFLM


-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)


 @register_model("sparseml")

--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
+import logging
 import os
 from functools import cached_property
 from operator import itemgetter
@@ -6,7 +7,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 from lm_eval.api.registry import register_model
 from lm_eval.models.api_models import TemplateAPI
 from lm_eval.models.utils import handle_stop_sequences
-from lm_eval.utils import eval_logger
+
+
+eval_logger = logging.getLogger(__name__)


 @register_model("local-completions")
@@ -288,4 +291,6 @@ class OpenAIChatCompletion(LocalChatCompletion):
        if "o1" in self.model:
            output.pop("stop")
            output["temperature"] = 1
+        elif "o3" in self.model:
+            output.pop("temperature")
        return output
--- a/lm_eval/models/optimum_ipex.py
+++ b/lm_eval/models/optimum_ipex.py
+import logging
 from importlib.util import find_spec

-from lm_eval import utils
 from lm_eval.api.registry import register_model
 from lm_eval.models.huggingface import HFLM
 from lm_eval.models.utils import get_dtype


-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)


 @register_model("ipex")

--- a/lm_eval/models/optimum_lm.py
+++ b/lm_eval/models/optimum_lm.py
 import json
+import logging
 from importlib.util import find_spec
 from pathlib import Path

-from lm_eval import utils
 from lm_eval.api.registry import register_model
 from lm_eval.models.huggingface import HFLM


-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)


 @register_model("openvino")

--- a/lm_eval/models/sglang_causallms.py
+++ b/lm_eval/models/sglang_causallms.py
+import copy
+import logging
+from importlib.util import find_spec
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+from tqdm import tqdm
+
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import TemplateLM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import (
+    Collator,
+    handle_stop_sequences,
+)
+from lm_eval.utils import (
+    get_rolling_token_windows,
+    make_disjoint_window,
+)
+
+
+# Module-level logger named after this module.
+eval_logger = logging.getLogger(__name__)
+
+# `sglang` is an optional dependency: a missing package is tolerated at import
+# time, which leaves the name `sgl` undefined. SGLangLM.__init__ checks for the
+# package with `find_spec` before it touches `sgl`.
+try:
+    import sglang as sgl
+except ModuleNotFoundError:
+    pass
+
+# No typing-only imports are declared; the guard body is currently empty.
+if TYPE_CHECKING:
+    pass
+
+
+@register_model("sglang")
+class SGLangLM(TemplateLM):
+    _DEFAULT_MAX_LENGTH = 2048
+
+    def __init__(
+        self,
+        pretrained: str,
+        # batch args from lm-eval interface:  https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md
+        batch_size: Union[str, int] = 1,
+        max_batch_size=None,
+        max_model_len: Optional[int] = None,
+        max_gen_toks: int = 256,
+        add_bos_token: Optional[bool] = False,
+        ########## SGlang native args ##########
+        # Todo(Jinwei): Include more args of SGLang Engine if needed. Refer to https://docs.sglang.ai/backend/server_arguments.html .
+        tokenizer_path: Optional[str] = None,
+        tokenizer_mode: str = "auto",
+        load_format: str = "auto",
+        trust_remote_code: bool = True,
+        dtype: str = "auto",
+        kv_cache_dtype: str = "auto",
+        context_length: Optional[int] = None,
+        device: str = "cuda",
+        chunked_prefill_size: int = -1,
+        # Memory and scheduling
+        mem_fraction_static: Optional[float] = None,
+        # parallelism
+        dp_size: int = 1,
+        tp_size: int = 1,
+        prefix_token_id: Optional[int] = None,
+        **kwargs,
+    ):
+        """Create an SGLang engine and keep its tokenizer for evaluation.
+
+        Args:
+            pretrained: Passed to the engine as `model_path`; a name containing
+                "gemma" (case-insensitive) forces `add_bos_token` to True.
+            batch_size: An integer, or any string containing "auto", which is
+                stored as the literal "auto".
+            max_batch_size: Accepted but not used by this constructor.
+            max_model_len: Maximum sequence length override. Mutually exclusive
+                with `context_length`.
+            max_gen_toks: Default number of new tokens for generation.
+            add_bos_token: Whether tokenization adds special tokens by default.
+            context_length: Alternative spelling of the maximum sequence length;
+                it is only stored as the length override and is not added to
+                the engine arguments here.
+            device: Must be None or contain "cuda".
+            prefix_token_id: Optional override for `prefix_token_id`.
+            **kwargs: Merged into the engine arguments, overriding the named ones.
+            The remaining named arguments are forwarded to `sgl.Engine` unchanged.
+
+        Raises:
+            ModuleNotFoundError: If the `sglang` package is not installed.
+            AssertionError: If `device` is not a CUDA device, or if both
+                `context_length` and `max_model_len` are given.
+        """
+        super().__init__()
+
+        if not find_spec("sglang"):
+            raise ModuleNotFoundError(
+                "attempted to use 'sglang' LM type, but package `sglang` is not installed. "
+                "Please install sglang via official document here:https://docs.sglang.ai/start/install.html#install-sglang"
+            )
+
+        # Test for None first: `"cuda" in None` raises TypeError, so the previous
+        # operand order could never accept device=None.
+        assert device is None or "cuda" in device, "SGLang only supports CUDA"
+        assert context_length is None or max_model_len is None, (
+            "Either context_length or max_model_len may be provided, but not both"
+        )
+        # Initialize your sglang model here
+        self._max_length = (
+            max_model_len if max_model_len is not None else context_length
+        )
+        self.tensor_parallel_size = int(tp_size)
+        self.data_parallel_size = int(dp_size)
+        self.model_args = {
+            "model_path": pretrained,
+            "tokenizer_path": tokenizer_path,
+            "tokenizer_mode": tokenizer_mode,
+            "load_format": load_format,
+            "trust_remote_code": trust_remote_code,
+            "dtype": dtype,
+            "kv_cache_dtype": kv_cache_dtype,
+            "device": device,
+            "mem_fraction_static": mem_fraction_static,
+            "tp_size": self.tensor_parallel_size,
+            "dp_size": self.data_parallel_size,
+            "chunked_prefill_size": chunked_prefill_size,
+        }
+
+        # Extra keyword arguments win over the named engine arguments above.
+        self.model_args.update(kwargs)
+        self.batch_size = (
+            "auto"
+            if isinstance(batch_size, str) and "auto" in batch_size
+            else int(batch_size)
+        )
+        if self.data_parallel_size > 1:
+            eval_logger.warning(
+                "Data parallelism will be deprecated in the future version of SGLang. See here: https://docs.sglang.ai/backend/server_arguments.html#data-parallelism ."
+            )
+        self.model = sgl.Engine(**self.model_args)
+
+        # Todo(Jinwei): check tokenizer and other settings.
+        self.tokenizer = self.model.tokenizer_manager.tokenizer
+        self._max_gen_toks = max_gen_toks
+        self.add_bos_token = add_bos_token
+        if "gemma" in pretrained.lower():
+            self.add_bos_token = True
+            eval_logger.info(
+                "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
+            )
+        self.custom_prefix_token_id = prefix_token_id
+
+    def loglikelihood_rolling(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[float]:
+        """Score each request's whole string with rolling token windows.
+
+        Every request string is tokenized and split into disjoint windows of at
+        most `max_length` tokens. All windows from all requests are scored in
+        batches, then summed back per request.
+
+        Args:
+            requests: Instances whose `args` is a 1-tuple holding the string.
+            disable_tqdm: Disables the tokenization progress bar.
+
+        Returns:
+            One summed log-likelihood per request, in request order.
+        """
+        # Nothing to score. Without this guard, batch_size="auto" resolves to
+        # len(requests) == 0, which is falsy, so the fallback int("auto") below
+        # would raise ValueError.
+        if not requests:
+            return []
+
+        adaptive_batch_size = None
+        if self.batch_size == "auto":
+            adaptive_batch_size = len(requests)
+
+        # First, collect all windows from all requests
+        all_windows = []  # List of (request_idx, window) tuples
+        request_window_counts = []  # Track number of windows per request
+
+        for req_idx, (string,) in enumerate(
+            tqdm(
+                [req.args for req in requests],
+                disable=(disable_tqdm or (self.rank != 0)),
+            )
+        ):
+            rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
+                map(
+                    make_disjoint_window,
+                    get_rolling_token_windows(
+                        token_list=self.tok_encode(string),
+                        prefix_token=self.prefix_token_id,
+                        # max_seq_len - (1 for context)
+                        max_seq_len=self.max_length - 1,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
+            # A leading None stands in for the cache key expected by
+            # `_loglikelihood_tokens`.
+            windows = [(None,) + x for x in rolling_token_windows]
+
+            # Store windows with their request index
+            all_windows.extend((req_idx, window) for window in windows)
+            request_window_counts.append(len(windows))
+
+        all_nlls = []
+        batch_size = adaptive_batch_size or int(self.batch_size)
+        for i in range(0, len(all_windows), batch_size):
+            batch = all_windows[i : i + batch_size]
+            # Extract just the windows for processing, keeping track of request indices
+            batch_indices, batch_windows = zip(*batch)
+
+            batch_nlls = self._loglikelihood_tokens(
+                requests=batch_windows,
+                disable_tqdm=False,
+            )
+            # Store results with their request indices
+            all_nlls.extend(zip(batch_indices, batch_nlls))
+
+        # Reconstruct per-request loglikelihoods
+        loglikelihoods = []
+        current_idx = 0
+        for window_count in request_window_counts:
+            # Windows were appended request by request, so each request owns a
+            # contiguous slice of `all_nlls`.
+            request_nlls = all_nlls[current_idx : current_idx + window_count]
+            # Sum up the nlls for this request (discarding is_greedy)
+            request_total = sum(nll[0] for _, nll in request_nlls)
+            loglikelihoods.append(request_total)
+            current_idx += window_count
+
+            string = requests[len(loglikelihoods) - 1].args[0]
+            self.cache_hook.add_partial(
+                "loglikelihood_rolling", (string,), request_total
+            )
+
+        return loglikelihoods
+
+    def generate_until(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[str]:
+        """Generate a continuation for every request.
+
+        Requests are grouped by their generation kwargs and sorted by descending
+        context length, generated batch by batch, and returned in the original
+        request order.
+
+        Args:
+            requests: Instances whose `args` is `(context, gen_kwargs)`.
+            disable_tqdm: Disables the progress bar.
+
+        Returns:
+            The generated text for each request, in request order.
+
+        Raises:
+            ValueError: If a request's generation kwargs is not a dict.
+        """
+        # `zip(*...)` over no requests cannot be unpacked into two names.
+        if not requests:
+            return []
+
+        res = []
+
+        # batch tokenize contexts
+        context, all_gen_kwargs = zip(*(req.args for req in requests))
+        context_encoding: List[List[int]] = self.tok_encode(
+            context, add_special_tokens=self.add_bos_token
+        )
+        requests = [
+            ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
+        ]
+
+        def _collate_gen(_requests):
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+            return -len(_requests[0][1]), _requests[0][0]
+
+        # we group requests by their generation_kwargs,
+        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
+        # in the same batch.
+        re_ords = Collator(requests, _collate_gen, group_by="gen_kwargs")
+        chunks = re_ords.get_batched(
+            n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
+        )
+
+        pbar = tqdm(
+            total=len(requests),
+            disable=(disable_tqdm or (self.rank != 0)),
+            desc="Running generate_until requests",
+        )
+        # for each different set of kwargs, we execute all requests, by batch.
+        eos = self.tokenizer.decode(self.eot_token_id)
+        for chunk in chunks:
+            context_and_encoding, all_gen_kwargs = zip(*chunk)
+            context, context_encoding = zip(*context_and_encoding)
+
+            # we assume all gen kwargs in the batch are the same
+            # this is safe to assume because the `grouper` object ensures it.
+            gen_kwargs = all_gen_kwargs[0]
+            if not isinstance(gen_kwargs, dict):
+                raise ValueError(
+                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
+                )
+            # unpack our keyword arguments.
+            kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
+            # add EOS token to stop sequences
+            until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
+            max_gen_toks = kwargs.pop("max_gen_toks", self.max_gen_toks)
+
+            # set the max length in tokens of inputs ("context_enc")
+            # max len for inputs = max length, minus room to generate the max new tokens
+            max_ctx_len = self.max_length - max_gen_toks
+            context_encoding = [x[-max_ctx_len:] for x in context_encoding]
+
+            # perform batched generation
+            # cont is a list of dicts. See here https://github.com/sgl-project/sglang/blob/0a6f18f068e4095fc228e798454e8496c9749214/python/sglang/srt/entrypoints/engine.py#L111 .
+            cont = self._model_generate(
+                requests=context_encoding,
+                generate=True,
+                max_tokens=max_gen_toks,
+                stop=until,
+                **kwargs,
+            )
+
+            # cache generations; a distinct loop name avoids rebinding `context`
+            # while it is being iterated.
+            for output, ctx in zip(cont, context):
+                generated_text = output.get("text", "")
+                res.append(generated_text)
+                self.cache_hook.add_partial(
+                    "generate_until", (ctx, gen_kwargs), generated_text
+                )
+                pbar.update(1)
+
+        pbar.close()
+        # reorder all group of results back to original unsorted form
+        return re_ords.get_original(res)
+
+    def _model_generate(
+        self,
+        requests: List[List[int]] = None,
+        generate: bool = False,
+        max_tokens: int = None,
+        stop: Optional[List[str]] = None,
+        return_logprob: bool = False,
+        top_logprobs_num: int = 1,
+        logprob_start_len: int = -1,
+        **kwargs,
+    ):
+        """Run the engine on pre-tokenized prompts and return its raw outputs.
+
+        In generation mode the sampling parameters start from `max_tokens` and
+        `stop`, and `kwargs` is first passed through `modify_gen_kwargs`. In
+        scoring mode they start from temperature 0 and a single new token. In
+        both modes `kwargs` overrides the starting values.
+        """
+        # check sglang sampling parameters: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/sampling/sampling_params.py#L21  and https://docs.sglang.ai/references/sampling_params.html.
+        if generate:
+            overrides = self.modify_gen_kwargs(kwargs)
+            base_params = {"max_new_tokens": max_tokens, "stop": stop}
+        else:
+            overrides = kwargs
+            base_params = {"temperature": 0, "max_new_tokens": 1}
+        sampling_params = {**base_params, **overrides}
+
+        # Refer to:  https://docs.sglang.ai/backend/offline_engine_api.html
+        return self.model.generate(
+            input_ids=requests,
+            sampling_params=sampling_params,
+            return_logprob=return_logprob,
+            top_logprobs_num=top_logprobs_num,
+            logprob_start_len=logprob_start_len,
+        )
+
+    @property
+    def eot_token_id(self):
+        """End-of-text token id, read from the tokenizer's `eos_token_id`."""
+        tokenizer = self.tokenizer
+        return tokenizer.eos_token_id
+
+    @property
+    def prefix_token_id(self):
+        """Token id used as the prefix for loglikelihood scoring.
+
+        Preference order: the custom id given at construction, then the
+        tokenizer's BOS id, then its EOS id.
+        """
+        custom = self.custom_prefix_token_id
+        if custom is not None:
+            return custom
+        tokenizer = self.tokenizer
+        bos = tokenizer.bos_token_id
+        return tokenizer.eos_token_id if bos is None else bos
+
+    @property
+    def max_length(self):
+        """Maximum sequence length in tokens.
+
+        A truthy manually-set value wins; otherwise the engine's
+        `tokenizer_manager.context_len` is used when present, falling back to
+        `_DEFAULT_MAX_LENGTH`.
+        """
+        manual = self._max_length
+        if manual:
+            return manual
+        engine = self.model
+        if not hasattr(engine, "tokenizer_manager"):
+            return self._DEFAULT_MAX_LENGTH
+        manager = engine.tokenizer_manager
+        if not hasattr(manager, "context_len"):
+            return self._DEFAULT_MAX_LENGTH
+        return manager.context_len
+
+    @property
+    def max_gen_toks(self):
+        """Default maximum number of tokens to generate, as set at construction."""
+        limit = self._max_gen_toks
+        return limit
+
+    def tok_encode(
+        self,
+        string: Union[str, List[str]],
+        left_truncate_len: int = None,
+        add_special_tokens: bool = False,
+        truncation: bool = False,
+    ) -> Union[List[int], List[List[int]]]:
+        """Tokenize one string or a batch of strings into token ids.
+
+        Special tokens are added when either `add_special_tokens` or the
+        instance-level `add_bos_token` is truthy. When `left_truncate_len` is
+        truthy, only the last `left_truncate_len` ids of each encoding are kept.
+        """
+        use_special_tokens = add_special_tokens or self.add_bos_token
+        tokenized = self.tokenizer(
+            string,
+            add_special_tokens=use_special_tokens,
+            truncation=truncation,
+            return_attention_mask=False,
+        )
+        encoding: Union[List[List[int]], List[int]] = tokenized.input_ids
+
+        if not left_truncate_len:
+            return encoding
+
+        # Keep the rightmost tokens: per item for a batch, once for a single string.
+        if isinstance(string, str):
+            return encoding[-left_truncate_len:]
+        return [ids[-left_truncate_len:] for ids in encoding]
+
+    def tok_decode(self, tokens: List[int]) -> str:
+        """Decode token ids back to text with the engine's tokenizer.
+
+        Previously a stub that returned None despite the `-> str` annotation.
+        This uses the same `tokenizer.decode` call the class already relies on
+        to render the EOS string in `generate_until`.
+        """
+        return self.tokenizer.decode(tokens)
+
+    @property
+    def tokenizer_name(self) -> str:
+        """
+        Return the name of the model's tokenizer.
+        The returned string is used to cache requests.
+
+        Previously a stub that returned None. The name is now taken from the
+        tokenizer's `name_or_path` attribute when it has a non-empty one, and
+        otherwise from the tokenizer or model path given at construction.
+        Path separators are replaced so the value is safe inside a cache key.
+
+        Returns:
+            str: The name of the model's tokenizer.
+        """
+        # NOTE(review): assumes the engine exposes a Hugging Face style
+        # tokenizer with `name_or_path` - confirm; the fallback covers its absence.
+        name = getattr(self.tokenizer, "name_or_path", None)
+        if not name:
+            name = (
+                self.model_args.get("tokenizer_path") or self.model_args["model_path"]
+            )
+        return str(name).replace("/", "__")
+
+    def chat_template(self, chat_template: Union[bool, str] = False) -> str:
+        """
+        Get the appropriate chat template for the model based on the `chat_template` argument.
+
+        This method returns the chat template string to build the prompt from a chat history.
+        The chat template is saved in the evaluation results for reproducibility.
+        Boolean arguments should be used with models that have only one chat template,
+        while string arguments are used with models that have multiple chat templates.
+        For the reference implementation, see HFLM class in `lm_eval.models.huggingface`.
+
+        Args:
+            chat_template (Union[bool, str]): Specifies whether to apply a chat template:
+                - If False: Do not apply any chat template.
+                - If True: Apply the default chat template.
+                - If str: Apply the specified chat template by name.
+
+        Returns:
+            str: The selected chat template in Jinja format.
+        """
+        # NOTE(review): not implemented yet - the body is a stub, so this method
+        # returns None for every argument and the contract described in the
+        # docstring above is not fulfilled.
+        pass
+
+    def apply_chat_template(
+        self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
+    ) -> str:
+        """
+        Render a chat history between user and model into one prompt string
+        using the tokenizer's chat template. When no generation prompt is
+        requested, the final message is continued instead.
+        """
+        continue_last = not add_generation_prompt
+        return self.tokenizer.apply_chat_template(
+            chat_history,
+            tokenize=False,
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=continue_last,
+        )
+
+    def _loglikelihood_tokens(
+        self,
+        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+        disable_tqdm: bool = False,
+    ) -> List[Tuple[float, bool]]:
+        """Score tokenized (cache_key, context, continuation) requests.
+
+        Requests are sorted longest-first and batched. Each one is left-truncated
+        to `max_length` tokens, run through the engine with input logprobs
+        enabled, and reduced to `(continuation_logprob, is_greedy)`. Results are
+        returned in the original request order.
+        """
+
+        def _longest_first(request):
+            joined = request[1] + request[2]
+            return -len(joined), tuple(joined)
+
+        # Reorder requests by length and batch
+        collator = Collator(requests, sort_fn=_longest_first)
+        per_batch = 0 if self.batch_size == "auto" else int(self.batch_size)
+        chunks = collator.get_batched(n=per_batch, batch_fn=None)
+        progress = tqdm(
+            total=len(requests),
+            disable=disable_tqdm,
+            desc="Running loglikelihood requests",
+        )
+
+        scored = []
+        for chunk in chunks:
+            inputs = []
+            ctxlens = []
+            for _, context_enc, continuation_enc in chunk:
+                combined = context_enc + continuation_enc
+                # Tokens dropped from the left shorten the visible context.
+                overflow = max(0, len(combined) - (self.max_length))
+                inputs.append(combined[-(self.max_length) :])
+                ctxlens.append(len(context_enc) - overflow)
+
+            outputs = self._model_generate(
+                requests=inputs,
+                generate=False,
+                return_logprob=True,
+                top_logprobs_num=2,
+                logprob_start_len=0,
+            )
+            for output, ctxlen, (cache_key, _, _), inp in zip(
+                outputs, ctxlens, chunk, inputs
+            ):
+                answer = self._parse_logprobs(tokens=inp, outputs=output, ctxlen=ctxlen)
+                scored.append(answer)
+
+                # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                # all with cache key None. instead do add_partial on the per-example level
+                # in the loglikelihood_rolling() function for those.
+                if cache_key is not None:
+                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+                progress.update(1)
+        progress.close()
+        return collator.get_original(scored)
+
+    @staticmethod
+    def _parse_logprobs(tokens: List, outputs, ctxlen: int) -> Tuple[float, bool]:
+        """Process logprobs and tokens.
+
+        :param tokens: list
+            Input tokens (potentially left-truncated)
+        :param outputs:
+            Contains input_token_logprobs and input_top_logprobs
+        :param ctxlen: int
+            Length of context (so we can slice them away and only keep the predictions).
+            Values below zero are treated as zero.
+        :return:
+            continuation_logprobs: float
+                Log probabilities of continuation tokens
+            is_greedy: bool
+                Whether argmax matches given continuation exactly
+        """
+        # When truncation removed more tokens than the context held, the caller's
+        # ctxlen goes negative; a negative slice start would count from the END
+        # and score only the tail. Every retained token is continuation then.
+        ctxlen = max(ctxlen, 0)
+
+        # The first entry of input_token_logprobs is None because the model has no previous tokens to condition on.
+        # [(logprob, token_id, token_text)]
+        continuation_logprobs_lists = outputs["meta_info"]["input_token_logprobs"]
+        continuation_logprobs = sum(
+            logprob
+            for logprob, _, _ in continuation_logprobs_lists[ctxlen:]
+            # Skip the unconditioned first position so ctxlen == 0 cannot add None.
+            if logprob is not None
+        )
+
+        top_logprobs_lists = outputs["meta_info"]["input_top_logprobs"]
+
+        # Determine if is_greedy
+        is_greedy = True
+        for token, top_logprobs in zip(tokens[ctxlen:], top_logprobs_lists[ctxlen:]):
+            # Positions without top-logprob data cannot contradict greediness.
+            if not top_logprobs:
+                continue
+            top_token = max(top_logprobs, key=lambda x: x[0])[1]
+            if top_token != token:
+                is_greedy = False
+                break
+        return continuation_logprobs, is_greedy
+
+    @staticmethod
+    def modify_gen_kwargs(kwargs: dict) -> dict:
+        """Translate lm-eval generation kwargs into SGLang sampling parameters.
+
+        Removes `do_sample`; when it is exactly False and no temperature was
+        given, the temperature is set to 0.0. Both `skip_special_tokens` and
+        `spaces_between_special_tokens` default to False when absent.
+
+        The dict is modified in place and the same object is returned.
+        """
+        # sampling_params
+        do_sample = kwargs.pop("do_sample", None)
+        if do_sample is False and "temperature" not in kwargs:
+            # Message previously named vLLM, copied from that backend.
+            eval_logger.debug(
+                "Got `do_sample=False` and no temperature value, setting SGLang temperature to 0.0 ..."
+            )
+            kwargs["temperature"] = 0.0
+        # hf defaults
+        kwargs.setdefault("skip_special_tokens", False)
+        kwargs.setdefault("spaces_between_special_tokens", False)
+        return kwargs
--- a/lm_eval/models/utils.py
+++ b/lm_eval/models/utils.py
@@ -2,6 +2,7 @@ import collections
 import fnmatch
 import gc
 import itertools
+import logging
 import time
 from functools import wraps
 from typing import (
@@ -22,7 +23,8 @@ from typing import (
 import torch
 import transformers

-from lm_eval.utils import eval_logger
+
+eval_logger = logging.getLogger(__name__)


 if TYPE_CHECKING:

--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
 import copy
+import logging
 from importlib.metadata import version
 from importlib.util import find_spec
 from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
@@ -17,7 +18,6 @@ from lm_eval.models.utils import (
    undistribute,
 )
 from lm_eval.utils import (
-    eval_logger,
    get_rolling_token_windows,
    make_disjoint_window,
 )
@@ -34,7 +34,7 @@ except ModuleNotFoundError:
 if TYPE_CHECKING:
    pass

-eval_logger = eval_logger
+eval_logger = logging.getLogger(__name__)


 @register_model("vllm")
@@ -75,7 +75,6 @@ class VLLM(TemplateLM):
                "Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
            )

-        assert "cuda" in device or device is None, "vLLM only supports CUDA"
        assert max_length is None or max_model_len is None, (
            "Either max_length or max_model_len may be provided, but not both"
        )
@@ -110,7 +109,7 @@ class VLLM(TemplateLM):
            eval_logger.warning(
                "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
            )
-            self.model_args["worker_use_ray"] = True
+            self.model_args["distributed_executor_backend"] = "ray"
            self.batch_size = "auto"
            eval_logger.info("Manual batching is not compatible with data parallelism.")

@@ -244,15 +243,13 @@ class VLLM(TemplateLM):
                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
        if self.data_parallel_size > 1:
-            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+            # vLLM hangs if resources are set in ray.remote
            # also seems to only work with decorator and not with ray.remote() fn
            # see https://github.com/vllm-project/vllm/issues/973
-            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-            # but then tensor_parallel breaks
            @ray.remote
            def run_inference_one_model(
                model_args: dict,
-                sampling_params,
+                sampling_params: SamplingParams,
                requests: List[List[int]],
                lora_request: LoRARequest,
            ):

--- a/lm_eval/models/vllm_vlms.py
+++ b/lm_eval/models/vllm_vlms.py
 import copy
+import logging
 from typing import Dict, List, Optional

 import transformers
@@ -14,7 +15,9 @@ from lm_eval.models.utils import (
    undistribute,
 )
 from lm_eval.models.vllm_causallms import VLLM
-from lm_eval.utils import eval_logger
+
+
+eval_logger = logging.getLogger(__name__)


 try:
@@ -106,11 +109,9 @@ class VLLM_VLM(VLLM):
                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
        if self.data_parallel_size > 1:
-            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+            # vLLM hangs if resources are set in ray.remote
            # also seems to only work with decorator and not with ray.remote() fn
            # see https://github.com/vllm-project/vllm/issues/973
-            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-            # but then tensor_parallel breaks
            @ray.remote
            def run_inference_one_model(
                model_args: dict, sampling_params, requests: List[List[dict]]

--- a/lm_eval/prompts/__init__.py
+++ b/lm_eval/prompts/__init__.py
 import ast
+import logging
 import os
 from typing import Dict

 from lm_eval import utils
-from lm_eval.utils import eval_logger


+eval_logger = logging.getLogger(__name__)
+
 # Prompt library.
 # Stores prompts in a dictionary indexed by 2 levels:
 # prompt category name, and prompt name.

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -42,6 +42,7 @@
 | [eus_proficiency](eus_proficiency/README.md)                             | Tasks designed to test proficiency in the Basque language across various topics. | Basque                                                                                                                        |
 | [eus_reading](eus_reading/README.md)                                     | Reading comprehension tasks specifically designed for the Basque language. | Basque                                                                                                                        |
 | [eus_trivia](eus_trivia/README.md)                                       | Trivia and knowledge testing tasks in the Basque language. | Basque                                                                                                                        |
+| [evalita-LLM](evalita-LLM/README.md)                                     | A native Italian benchmark with diverse task formats and multiple prompts. | Italian                                                                                                                      |
 | [fda](fda/README.md)                                                     | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English                                                                                                                       |
 | [fld](fld/README.md)                                                     | Tasks involving free-form and directed dialogue understanding. | English                                                                                                                       |
 | [french_bench](french_bench/README.md)                                   | Set of tasks designed to assess language model performance in French. | French                                                                                                                        |
@@ -50,6 +51,7 @@
 | [glue](glue/README.md)                                                   | General Language Understanding Evaluation benchmark to test broad language abilities. | English                                                                                                                       |
 | [gpqa](gpqa/README.md)                                                   | Tasks designed for general public question answering and knowledge verification. | English                                                                                                                       |
 | [gsm8k](gsm8k/README.md)                                                 | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English                                                                                                                       |
+| [groundcocoa](groundcocoa/README.md)                                     | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English                                                                                                                       |
 | [haerae](haerae/README.md)                                               | Tasks focused on assessing detailed factual and historical knowledge. | Korean                                                                                                                        |
 | [headqa](headqa/README.md)                                               | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English                                                                                                              |
 | [hellaswag](hellaswag/README.md)                                         | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English                                                                                                                       |
@@ -85,7 +87,7 @@
 | [mlqa](mlqa/README.md)                                                   | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese                                                       |
 | [mmlu](mmlu/README.md)                                                   | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English                                                                                                                       |
 | [mmlu_pro](mmlu_pro/README.md)                                           | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English                                                                                                                       |
-| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs.                                                                                                                                                                                                                                                   | English |
+| [mmlu-pro-plus](mmlu-pro-plus/README.md)                                 | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs.                                                                                                                                                                                                                                                   | English |
 | [mmlusr](mmlusr/README.md)                                               | Variation of MMLU designed to be more rigorous. | English                                                                                                                       |
 | model_written_evals                                                      | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. |                                                                                                                               |
 | [moral_stories](moral_stories/README.md)                                 | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English                                                                                                                       |

--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -14,6 +14,8 @@ from lm_eval.tasks.mmlu_pro.utils import doc_to_text

 GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())

+eval_logger = logging.getLogger(__name__)
+

 def convert_mcq_to_generative(cfg: dict):
    prompt = """Given the following question and candidate answers, choose the correct answer."""
@@ -71,15 +73,14 @@ class TaskManager:

    def __init__(
        self,
-        verbosity="INFO",
+        verbosity: Optional[str] = None,
        include_path: Optional[Union[str, List]] = None,
        include_defaults: bool = True,
        mcq_to_generative: bool = False,
    ) -> None:
-        self.verbosity = verbosity
+        if verbosity is not None:
+            utils.setup_logging(verbosity)
        self.include_path = include_path
-        self.logger = utils.eval_logger
-        self.logger.setLevel(getattr(logging, f"{verbosity}"))

        self._task_index = self.initialize_tasks(
            include_path=include_path, include_defaults=include_defaults
@@ -513,7 +514,7 @@ class TaskManager:
                            "yaml_path": -1,
                        }
                    elif tasks_and_groups[tag]["type"] != "tag":
-                        self.logger.info(
+                        eval_logger.info(
                            f"The tag '{tag}' is already registered as a group, this tag will not be registered. "
                            "This may affect tasks you want to call."
                        )
@@ -576,7 +577,7 @@ class TaskManager:
                            config, task, tasks_and_groups, print_info
                        )
                    else:
-                        self.logger.debug(f"File {f} in {root} could not be loaded")
+                        eval_logger.debug(f"File {f} in {root} could not be loaded")

        return tasks_and_groups


--- a/lm_eval/tasks/arabicmmlu/_generate_configs.py
+++ b/lm_eval/tasks/arabicmmlu/_generate_configs.py
@@ -10,7 +10,7 @@ import yaml
 from tqdm import tqdm


-eval_logger = logging.getLogger("lm-eval")
+eval_logger = logging.getLogger(__name__)


 SUBJECTS = {

--- a/lm_eval/tasks/arithmetic/README.md
+++ b/lm_eval/tasks/arithmetic/README.md
@@ -58,3 +58,6 @@ If other tasks on this dataset are already supported:
 * [ ] Is the "Main" variant of this task clearly denoted?
 * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
+version 2.0: (2025-Feb-14) set target delimiter to "" as the targets already start with a space.
--- a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
@@ -8,11 +8,12 @@ validation_split: validation
 test_split: null
 doc_to_text: "{{context}}"
 doc_to_target: "{{completion}}"
+target_delimiter: ""
 metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: 1.0
+  version: 2.0
 dataset_kwargs:
  trust_remote_code: true
--- a/lm_eval/tasks/basque_bench/README.md
+++ b/lm_eval/tasks/basque_bench/README.md
@@ -5,14 +5,16 @@
 BasqueBench is a benchmark for evaluating language models in Basque tasks. That is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon.

 The new evaluation datasets included in BasqueBench are:
-| Task          | Category       | Homepage  |
-|:-------------:|:-----:|:-----:|
-| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
-| PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu |
-| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/wnli-eu |
-| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
+| Task     | Category                   | Homepage                                      |
+|:--------:|:--------------------------:|:---------------------------------------------:|
+| ARC_eu   | Question Answering         | https://huggingface.co/datasets/HiTZ/ARC-eu   |
+| MGSM_eu  | Math                       | https://huggingface.co/datasets/HiTZ/MGSM-eu  |
+| PAWS_eu  | Paraphrasing               | https://huggingface.co/datasets/HiTZ/PAWS-eu  |
+| PIQA_eu  | Question Answering         | https://huggingface.co/datasets/HiTZ/PIQA-eu  |
+| WNLI_eu  | Natural Language Inference | https://huggingface.co/datasets/HiTZ/WNLI-eu  |
+| XCOPA_eu | Commonsense Reasoning      | https://huggingface.co/datasets/HiTZ/XCOPA-eu |

-The datasets included in BasqueBench that have been made public in previous pubications are:
+The datasets included in BasqueBench that have been made public in previous publications are:

 | Task          | Category       | Paper title          | Homepage  |
 |:-------------:|:-----:|:-------------:|:-----:|
@@ -28,7 +30,40 @@ The datasets included in BasqueBench that have been made public in previous pubi


 ### Citation
-Paper for BasqueBench coming soon.
+
+```
+@inproceedings{baucells-etal-2025-iberobench,
+    title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
+    author = "Baucells, Irene  and
+      Aula-Blasco, Javier  and
+      de-Dios-Flores, Iria  and
+      Paniagua Su{\'a}rez, Silvia  and
+      Perez, Naiara  and
+      Salles, Anna  and
+      Sotelo Docio, Susana  and
+      Falc{\~a}o, J{\'u}lia  and
+      Saiz, Jose Javier  and
+      Sepulveda Torres, Robiert  and
+      Barnes, Jeremy  and
+      Gamallo, Pablo  and
+      Gonzalez-Agirre, Aitor  and
+      Rigau, German  and
+      Villegas, Marta",
+    editor = "Rambow, Owen  and
+      Wanner, Leo  and
+      Apidianaki, Marianna  and
+      Al-Khalifa, Hend  and
+      Eugenio, Barbara Di  and
+      Schockaert, Steven",
+    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
+    month = jan,
+    year = "2025",
+    address = "Abu Dhabi, UAE",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2025.coling-main.699/",
+    pages = "10491--10519",
+}
+```

 ### Groups and Tasks

@@ -40,6 +75,8 @@ Paper for BasqueBench coming soon.
 #### Tasks

 The following tasks evaluate tasks on BasqueBench dataset using various scoring methods.
+  - `arc_eu_challenge`
+  - `arc_eu_easy`
  - `belebele_eus_Latn`
  - `eus_exams_eu`
  - `eus_proficiency`
@@ -64,6 +101,7 @@ The following tasks evaluate tasks on BasqueBench dataset using various scoring
  - `flores_pt-eu`
  - `mgsm_direct_eu`
  - `mgsm_native_cot_eu`
+  - `paws_eu`
  - `piqa_eu`
  - `qnlieu`
  - `wnli_eu`

--- a/lm_eval/tasks/basque_bench/arc_eu_challenge.yaml
+++ b/lm_eval/tasks/basque_bench/arc_eu_challenge.yaml
+include: arc_eu_easy.yaml
+task: arc_eu_challenge
+dataset_name: ARC-Challenge
--- a/lm_eval/tasks/basque_bench/arc_eu_easy.yaml
+++ b/lm_eval/tasks/basque_bench/arc_eu_easy.yaml
+task: arc_eu_easy
+dataset_path: HiTZ/ARC-eu
+dataset_name: ARC-Easy
+output_type: multiple_choice
+training_split: null
+validation_split: validation
+test_split: test
+doc_to_text: "Galdera: {{question}}\nErantzuna:"
+doc_to_target: "{{choices.label.index(answerKey)}}"
+doc_to_choice: "{{choices.text}}"
+should_decontaminate: true
+doc_to_decontamination_query: "Galdera: {{question}}\nErantzuna:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0