Commit 79b31dad authored by Baber

Merge branch 'bos' into mrl

parents cbb8f5a4 7e5f909b
@@ -44,11 +44,11 @@ jobs:
echo "One or more test file(s) has changed."
echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
- - name: Set up Python 3.9
+ - name: Set up Python 3.10
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
uses: actions/setup-python@v5
with:
- python-version: 3.9
+ python-version: '3.10'
cache: 'pip'
cache-dependency-path: pyproject.toml
- name: Install dependencies
...
@@ -22,10 +22,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- - name: Set up Python 3.9
+ - name: Set up Python 3.10
uses: actions/setup-python@v5
with:
- python-version: 3.9
+ python-version: '3.10'
cache: pip
cache-dependency-path: pyproject.toml
- name: Pre-Commit
@@ -39,7 +39,7 @@ jobs:
strategy:
fail-fast: true
matrix:
- python-version: ["3.9", "3.10", "3.11"]
+ python-version: ["3.10", "3.11", "3.12"]
timeout-minutes: 30
steps:
- name: Checkout Code
...
+ from __future__ import annotations
import abc
import hashlib
import json
import logging
import os
- from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, TypeVar, Union
+ from collections.abc import Iterable
+ from typing import TYPE_CHECKING, Any, TypeVar
from tqdm import tqdm
@@ -31,7 +34,7 @@ class LM(abc.ABC):
# set rank and world size to a single process, by default.
self._rank = 0
self._world_size = 1
- self.cache_hook: "CacheHook" = CacheHook(None)
+ self.cache_hook: CacheHook = CacheHook(None)
@abc.abstractmethod
def loglikelihood(self, requests) -> list[tuple[float, bool]]:
@@ -137,7 +140,7 @@ class LM(abc.ABC):
@classmethod
def create_from_arg_string(
- cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
+ cls: type[T], arg_string: str, additional_config: dict | None = None
) -> T:
"""
Creates an instance of the LM class using the given argument string and additional config.
@@ -156,7 +159,7 @@ class LM(abc.ABC):
@classmethod
def create_from_arg_obj(
- cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None
+ cls: type[T], arg_dict: dict, additional_config: dict | None = None
) -> T:
"""
Creates an instance of the LM class using the given arg_obj
@@ -199,7 +202,7 @@ class LM(abc.ABC):
"To use this model with chat templates, please implement the 'tokenizer_name' property."
)
- def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
+ def chat_template(self, chat_template: bool | str = False) -> str | None:
"""Returns the chat template structure for user/assistant messages if a template is provided.
This method is intended to be overridden in a subclass to define a specific chat template format.
For models that do not support chat templates, this method returns None by default.
@@ -207,7 +210,7 @@ class LM(abc.ABC):
return ""
- def set_cache_hook(self, cache_hook: "CacheHook") -> None:
+ def set_cache_hook(self, cache_hook: CacheHook) -> None:
self.cache_hook = cache_hook
@@ -218,9 +221,9 @@ def hash_args(attr: str, args: Iterable[Any]) -> str:
class CacheHook:
- def __init__(self, cachinglm: Optional["CachingLM"]) -> None:
+ def __init__(self, cachinglm: CachingLM | None) -> None:
if cachinglm is None:
- self.dbdict: Optional["SqliteDict"] = None
+ self.dbdict: SqliteDict | None = None
return
self.dbdict = cachinglm.dbdict
@@ -258,7 +261,7 @@ class CachingLM:
eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
return lm_attr
- def _fn(requests: list["Instance"]) -> list["Instance"]:
+ def _fn(requests: list[Instance]) -> list[Instance]:
res = []
remaining_reqs = []
warned = False
@@ -313,7 +316,7 @@ class CachingLM:
return _fn
- def get_cache_hook(self) -> "CacheHook":
+ def get_cache_hook(self) -> CacheHook:
return CacheHook(self)
@@ -324,10 +327,11 @@ class TemplateLM(LM):
"""
tokenizer = None
+ backend = "causal"
@property
@abc.abstractmethod
- def eot_token_id(self):
+ def eot_token_id(self) -> int:
pass
@property
@@ -336,7 +340,9 @@ class TemplateLM(LM):
return self.eot_token_id
@abc.abstractmethod
- def tok_encode(self, string: str, **kwargs) -> list[int]:
+ def tok_encode(
+ self, string: str, add_special_tokens: bool | None = None, **kwargs
+ ) -> list[int]:
"""
Tokenize a string using the model's tokenizer and return a list of token IDs.
"""
@@ -344,45 +350,100 @@ class TemplateLM(LM):
@abc.abstractmethod
def _loglikelihood_tokens(
- self, requests: list["Instance"], **kwargs
+ self, requests: list[Instance], **kwargs
) -> list[tuple[float, bool]]:
pass
def _encode_pair(
self, context: str, continuation: str
) -> tuple[list[int], list[int]]:
- import transformers
+ """
+ Encode a context-continuation pair into separate token ID lists.
+ This method handles the tokenization of context and continuation strings while
+ preserving proper boundary handling. Trailing spaces in the context are moved
+ to the beginning of the continuation to ensure correct tokenization at the
+ word boundary.
+ For Seq2Seq models (encoder-decoder), context and continuation are encoded
+ separately. For other model types (decoder-only), the full sequence is encoded
+ together to ensure proper tokenization, then split at the context boundary.
+ :param context: str
+ The context string. Can be empty (will be handled by the caller).
+ :param continuation: str
+ The continuation string to be scored.
+ :return: tuple[list[int], list[int]]
+ A tuple of (context_enc, continuation_enc) where:
+ - context_enc: Token IDs for the context
+ - continuation_enc: Token IDs for the continuation
+ Note:
+ This method does NOT handle empty context. The caller should
+ handle empty context (see loglikelihood method).
+ """
+ assert context, "Context cannot be empty!"
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
- model_class = getattr(self, "AUTO_MODEL_CLASS", None)
- if model_class == transformers.AutoModelForSeq2SeqLM:
- context_enc = self.tok_encode(context)
- continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
- else:
+ if self.backend == "causal":
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
+ else:
+ # for SEQ2SEQ case we need to encode separately
+ context_enc = self.tok_encode(context)
+ continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
return context_enc, continuation_enc
def loglikelihood(
- self, requests: list["Instance"], disable_tqdm: bool = False
+ self, requests: list[Instance], disable_tqdm: bool = False
) -> list[tuple[float, bool]]:
+ """
+ Compute log-likelihood of generating continuations from contexts.
+ This is the concrete implementation for TemplateLM and its subclasses.
+ It tokenizes context-continuation pairs and delegates scoring to
+ _loglikelihood_tokens.
+ **IMPORTANT**: This method is expected to handle empty context strings.
+ When context is empty (""), it uses the model's prefix_token_id (typically
+ BOS or EOS token) as context. If the continuation already starts with the
+ prefix token, it reuses that token as context instead of duplicating it.
+ :param requests: list[Instance]
+ List of Instance objects with property `args` returning (context, continuation) tuples.
+ :param disable_tqdm: bool
+ Whether to disable the progress bar in _loglikelihood_tokens.
+ :return: list[tuple[float, bool]]
+ List of (log_prob, is_greedy) tuples for each request.
+ Implementation details:
+ - Empty context: Uses prefix_token_id (BOS/EOS) as context
+ - Non-empty context: Uses _encode_pair for proper tokenization
+ - Avoids token duplication when continuation starts with prefix_token_id
+ """
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
- # BOS or EOS as context
+ continuation_enc = self.tok_encode(
+ continuation, add_special_tokens=False
+ )
+ # BOS or EOS as context: handle when context is empty -> (context + continuation) -> (BOS + continuation
context_enc, continuation_enc = (
- [self.prefix_token_id],
- self.tok_encode(continuation),
+ ([self.prefix_token_id], continuation_enc)
+ if self.prefix_token_id != continuation_enc[0]
+ else (continuation_enc[:1], continuation_enc[1:])
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
@@ -400,7 +461,7 @@ class TemplateLM(LM):
def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
pass
- def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
+ def chat_template(self, chat_template: bool | str = False) -> str | None:
"""
Set and get the appropriate chat template for the model.
This method sets the tokenizer's chat_template and returns the template string for reproducibility.
...
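For readers skimming the hunk above: a standalone sketch of the empty-context handling the new `loglikelihood` docstring describes. The helper name and token IDs below are made up for illustration; only the branching mirrors the diff.

```python
# Illustrative only: mirrors the empty-context branch added to TemplateLM.loglikelihood.
# When context == "", the prefix token (BOS/EOS) is used as context, and it is reused
# rather than duplicated if the tokenized continuation already starts with it.
def split_empty_context(prefix_token_id: int, continuation_enc: list[int]) -> tuple[list[int], list[int]]:
    if prefix_token_id != continuation_enc[0]:
        return [prefix_token_id], continuation_enc
    return continuation_enc[:1], continuation_enc[1:]

assert split_empty_context(1, [5, 6, 7]) == ([1], [5, 6, 7])  # prefix token prepended as context
assert split_empty_context(1, [1, 5, 6]) == ([1], [5, 6])     # prefix token reused, not duplicated
```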
@@ -114,7 +114,7 @@ class TemplateAPI(TemplateLM):
# however the requests can be sent as a string if the API doesn't support token inputs.
# use tokenized_requests=False
tokenizer_backend: Optional[
- Literal["tiktoken", "huggingface", "None", "none"]
+ Literal["tiktoken", "huggingface", "remote", "None", "none"]
] = "huggingface",
truncate: bool = False,
# number of concurrent requests. More useful if not batching
@@ -132,6 +132,8 @@ class TemplateAPI(TemplateLM):
revision: Optional[str] = "main",
use_fast_tokenizer: bool = True,
verify_certificate: bool = True,
+ ca_cert_path: Optional[str] = None,
+ auth_token: Optional[str] = None,
eos_string: str = None,
# timeout in seconds
timeout: int = 300,
@@ -182,6 +184,8 @@ class TemplateAPI(TemplateLM):
self.tokenized_requests = tokenized_requests
self.max_retries = int(max_retries)
self.verify_certificate = verify_certificate
+ self.ca_cert_path = ca_cert_path
+ self.auth_token = auth_token
self._eos_string = eos_string
self.timeout = int(timeout)
self.max_images = int(max_images)
@@ -218,6 +222,21 @@ class TemplateAPI(TemplateLM):
f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. "
"Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
)
+ elif self.tokenizer_backend == "remote":
+ from lm_eval.utils import RemoteTokenizer
+ if not self.base_url:
+ raise ValueError(
+ "base_url is required for remote tokenizer backend"
+ )
+ self.tokenizer = RemoteTokenizer(
+ self.base_url,
+ self.timeout,
+ self.verify_certificate,
+ self.ca_cert_path,
+ self.auth_token,
+ )
+ eval_logger.info(f"Using remote tokenizer from {self.base_url}")
else:
import transformers
@@ -310,7 +329,7 @@ class TemplateAPI(TemplateLM):
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
- ) -> Union[str, JsonChatStr]:
+ ) -> Union[str, JsonChatStr, List[Dict]]:
"""Applies a chat template to a list of chat history between user and model."""
if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
return self.tokenizer.apply_chat_template(
@@ -319,6 +338,8 @@ class TemplateAPI(TemplateLM):
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
)
+ elif self.tokenizer_backend == "remote" and self.tokenized_requests:
+ return chat_history
else:
# bit of a hack. We'll load back before sending to the API
return JsonChatStr(
@@ -337,6 +358,8 @@ class TemplateAPI(TemplateLM):
return self.tokenizer.eos_token_id
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.eot_token
+ elif self.tokenizer_backend == "remote":
+ return self.tokenizer.eos_token_id
@cached_property
def eos_string(self) -> Optional[str]:
@@ -347,6 +370,8 @@ class TemplateAPI(TemplateLM):
return self.tokenizer.eos_token
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.decode([self.tokenizer.eot_token])
+ elif self.tokenizer_backend == "remote":
+ return self.tokenizer.eos_token
else:
eval_logger.warning(
"Cannot determine EOS string to pass to stop sequence. Manually set by passing `eos_string` to model_args."
@@ -364,6 +389,8 @@ class TemplateAPI(TemplateLM):
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
+ elif self.tokenizer_backend == "remote":
+ return self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
else:
return self.tokenizer.eot_token
@@ -396,7 +423,19 @@ class TemplateAPI(TemplateLM):
encoding = encoding[-left_truncate_len:]
return encoding
+ elif self.tokenizer_backend == "remote":
+ if isinstance(string, str):
+ encoding = self.tokenizer.encode(string)
+ else:
+ encoding = [self.tokenizer.encode(s) for s in string]
+ if left_truncate_len:
+ if isinstance(string, str):
+ encoding = encoding[-left_truncate_len:]
+ else:
+ encoding = [enc[-left_truncate_len:] for enc in encoding]
+ return encoding
else:
try:
encoding = self.tokenizer.encode(string)
@@ -409,6 +448,8 @@ class TemplateAPI(TemplateLM):
return self.tokenizer.batch_decode(tokens)
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.decode_batch(tokens)
+ elif self.tokenizer_backend == "remote":
+ return self.tokenizer.batch_decode(tokens)
def model_call(
self,
...
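The remote-tokenizer branches above only rely on a small duck-typed surface. A stub of that assumed interface (method and attribute names are taken from the calls in this diff; the actual `RemoteTokenizer` in `lm_eval.utils` and its HTTP layer are not shown here):

```python
# Assumed interface, inferred from the attribute accesses in the diff above
# (encode, batch_decode, eos_token, eos_token_id, bos_token_id). Values are placeholders.
class RemoteTokenizerStub:
    eos_token: str = "</s>"   # placeholder
    eos_token_id: int = 2     # placeholder
    bos_token_id: int = 1     # placeholder

    def encode(self, text: str) -> list[int]:
        raise NotImplementedError("would call the server's tokenize endpoint")

    def batch_decode(self, token_lists: list[list[int]]) -> list[str]:
        raise NotImplementedError("would call the server's detokenize endpoint")
```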
@@ -32,6 +32,7 @@ from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
+ bos_already_added,
clear_torch_cache,
configure_pad_token,
get_dtype,
@@ -84,7 +85,7 @@ class HFLM(TemplateLM):
max_batch_size: int | None = 64,
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
- add_bos_token: bool | None = False,
+ add_bos_token: bool | None = None,
prefix_token_id: int | None = None,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
@@ -258,11 +259,6 @@ class HFLM(TemplateLM):
)
self.add_bos_token = add_bos_token
- if "gemma" in getattr(self.config, "model_type", ""):
- self.add_bos_token = True
- eval_logger.info(
- f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
- )
self._max_length = max_length
self.pretrained = pretrained
@@ -744,7 +740,7 @@ class HFLM(TemplateLM):
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
gguf_file: str | None = None,
- add_bos_token: bool | None = False,
+ add_bos_token: bool | None = None,
subfolder: str | None = "",
) -> None:
"""Helper method during initialization.
@@ -763,8 +759,8 @@ class HFLM(TemplateLM):
else:
kwargs["use_fast"] = use_fast_tokenizer
- if add_bos_token:
- kwargs["add_bos_token"] = True
+ if add_bos_token is not None:
+ kwargs["add_bos_token"] = add_bos_token
if subfolder:
kwargs["subfolder"] = subfolder
@@ -858,23 +854,20 @@ class HFLM(TemplateLM):
def tok_encode(
self,
string: str,
- left_truncate_len: int | None = None,
add_special_tokens: bool | None = None,
+ left_truncate_len: int | None = None,
+ **kwargs,
) -> list[int]:
"""
# default for None - empty dict, use predefined tokenizer param
# used for all models except for CausalLM or predefined value
- special_tokens_kwargs = {}
- # by default for CausalLM - false or self.add_bos_token is set
- if add_special_tokens is None:
- if self.backend == "causal":
- special_tokens_kwargs = {
- "add_special_tokens": False or self.add_bos_token
- }
- # otherwise the method explicitly defines the value
- else:
- special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
+ special_tokens_kwargs = (
+ {"add_special_tokens": add_special_tokens}
+ if (isinstance(add_special_tokens, bool))
+ else {"add_special_tokens": self.add_bos_token}
+ if self.add_bos_token is not None
+ else {}
+ )
encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
@@ -897,7 +890,14 @@ class HFLM(TemplateLM):
add_special_tokens = {}
if self.backend == "causal":
- add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
+ if bos_already_added(
+ strings[0], getattr(self.tokenizer, "bos_token", None)
+ ):
+ add_special_tokens = {"add_special_tokens": False}
+ elif self.add_bos_token is not None:
+ add_special_tokens = {"add_special_tokens": self.add_bos_token}
+ else:
+ add_special_tokens = {}
encoding = self.tokenizer(
strings,
@@ -971,7 +971,7 @@ class HFLM(TemplateLM):
context,
max_length: int,
stop: list[str],
- **generation_kwargs: dict[str, Any],
+ **generation_kwargs,
) -> torch.Tensor:
# temperature = 0.0 if not set
# if do_sample is false and temp==0.0:
...
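The rewritten `tok_encode` above compresses the special-tokens logic into one expression; here is a tiny standalone sketch of the same precedence (explicit argument, then configured `add_bos_token`, then tokenizer default), using a hypothetical helper name:

```python
# Mirrors the precedence in the new HFLM.tok_encode kwargs construction.
def special_tokens_kwargs(add_special_tokens: bool | None, add_bos_token: bool | None) -> dict:
    if isinstance(add_special_tokens, bool):      # caller was explicit
        return {"add_special_tokens": add_special_tokens}
    if add_bos_token is not None:                 # fall back to the model-level setting
        return {"add_special_tokens": add_bos_token}
    return {}                                     # otherwise defer to the tokenizer default

assert special_tokens_kwargs(False, True) == {"add_special_tokens": False}
assert special_tokens_kwargs(None, True) == {"add_special_tokens": True}
assert special_tokens_kwargs(None, None) == {}
```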
@@ -16,12 +16,46 @@ eval_logger = logging.getLogger(__name__)
class LocalCompletionsAPI(TemplateAPI):
def __init__(
self,
- base_url: str = None,
- tokenizer_backend: str = "huggingface",
+ base_url=None,
+ tokenizer_backend="auto",
+ verify_certificate=True,
+ ca_cert_path=None,
+ auth_token=None,
**kwargs,
):
+ # Auto-detect tokenizer backend
+ if tokenizer_backend == "auto":
+ if base_url:
+ from lm_eval.utils import check_remote_tokenizer_support
+ if check_remote_tokenizer_support(
+ base_url,
+ verify_certificate=verify_certificate,
+ ca_cert_path=ca_cert_path,
+ auth_token=auth_token,
+ ):
+ eval_logger.info(
+ "Auto-detected remote tokenizer support. Using remote tokenizer backend."
+ )
+ tokenizer_backend = "remote"
+ else:
+ eval_logger.info(
+ "Remote tokenizer not supported. Using huggingface tokenizer backend."
+ )
+ tokenizer_backend = "huggingface"
+ else:
+ eval_logger.warning(
+ "No base_url provided. Using huggingface tokenizer backend."
+ )
+ tokenizer_backend = "huggingface"
super().__init__(
- base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
+ base_url=base_url,
+ tokenizer_backend=tokenizer_backend,
+ verify_certificate=verify_certificate,
+ ca_cert_path=ca_cert_path,
+ auth_token=auth_token,
+ **kwargs,
)
def _create_payload(
@@ -98,20 +132,28 @@ class LocalCompletionsAPI(TemplateAPI):
@register_model("local-chat-completions")
class LocalChatCompletion(LocalCompletionsAPI):
+ """
+ Minimal chat-completions wrapper.
+ - Only accepts messages as list[dict].
+ - No tokenization or template logic.
+ - Use with --apply_chat_template or ensure upstream formats messages correctly.
+ """
def __init__(
self,
- base_url: str = None,
- tokenizer_backend: str = None,
- tokenized_requests: bool = False,
+ base_url=None,
+ verify_certificate=True,
+ ca_cert_path=None,
+ auth_token=None,
**kwargs,
):
+ eval_logger.warning(
+ "chat-completions endpoint requires the `--apply_chat_template` flag."
+ )
super().__init__(
base_url=base_url,
- tokenizer_backend=tokenizer_backend,
- tokenized_requests=tokenized_requests,
+ tokenizer_backend=None,
+ tokenized_requests=None,
+ verify_certificate=verify_certificate,
+ ca_cert_path=ca_cert_path,
+ auth_token=auth_token,
**kwargs,
)
if self._batch_size > 1:
@@ -129,9 +171,13 @@ class LocalChatCompletion(LocalCompletionsAPI):
eos=None,
**kwargs,
) -> dict:
- assert type(messages) is not str, (
- "chat-completions require the --apply_chat_template flag."
+ assert isinstance(messages, list) and all(
+ isinstance(m, dict) for m in messages
+ ), (
+ "LocalChatCompletion expects messages as list[dict]. "
+ "If you see this error, ensure --apply_chat_template is set or upstream code formats messages correctly."
)
+ gen_kwargs = gen_kwargs or {}
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
...
@@ -150,7 +150,7 @@ class Grouper:
def pad_and_concat(
max_length: int,
- tensors: List[torch.Tensor],
+ tensors: list[torch.Tensor],
padding_side: Literal["right", "left"] = "right",
):
"""
@@ -881,3 +881,7 @@ def postprocess_generated_text(
generation = generation.split(think_end_token)[-1].lstrip()
return generation
+ def bos_already_added(sequence: str, bos_string: Optional[str]):
+ return sequence.startswith(bos_string) if bos_string is not None else False
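The new `bos_already_added` helper is small enough to exercise directly; a quick usage sketch with made-up strings:

```python
# Behavior of the helper added above: True only when a BOS string is known
# and the text already starts with it.
def bos_already_added(sequence: str, bos_string: str | None) -> bool:
    return sequence.startswith(bos_string) if bos_string is not None else False

assert bos_already_added("<s>Hello", "<s>") is True   # BOS already present in the string
assert bos_already_added("Hello", "<s>") is False     # tokenizer should still add BOS
assert bos_already_added("Hello", None) is False      # model has no BOS string at all
```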
+ from __future__ import annotations
import copy
import gc
import logging
@@ -7,7 +9,7 @@ from importlib.util import find_spec
from multiprocessing import Process, Queue
from queue import Empty
from time import sleep
- from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
+ from typing import TYPE_CHECKING, Literal
import jinja2
from more_itertools import distribute
@@ -19,6 +21,7 @@ from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
+ bos_already_added,
configure_pad_token,
handle_stop_sequences,
postprocess_generated_text,
@@ -50,10 +53,10 @@ eval_logger = logging.getLogger(__name__)
def _vllm_mp_worker(
model_args: dict,
- sampling_params: list["SamplingParams"],
+ sampling_params: list[SamplingParams],
requests: list[list[int]],
- lora_request: "LoRARequest",
- result_queue: "Queue",
+ lora_request: LoRARequest,
+ result_queue: Queue,
dp_size: int,
local_dp_rank: int,
dp_master_port: int,
@@ -113,18 +116,18 @@ class VLLM(TemplateLM):
self,
pretrained: str,
dtype: Literal["float16", "bfloat16", "float32", "auto"] = "auto",
- revision: Optional[str] = None,
- trust_remote_code: Optional[bool] = False,
- tokenizer: Optional[str] = None,
+ revision: str | None = None,
+ trust_remote_code: bool | None = False,
+ tokenizer: str | None = None,
tokenizer_mode: Literal["auto", "slow"] = "auto",
- tokenizer_revision: Optional[str] = None,
- add_bos_token: Optional[bool] = False,
- prefix_token_id: Optional[int] = None,
+ tokenizer_revision: str | None = None,
+ add_bos_token: bool | None = False,
+ prefix_token_id: int | None = None,
tensor_parallel_size: int = 1,
- quantization: Optional[str] = None,
+ quantization: str | None = None,
max_gen_toks: int = 256,
swap_space: int = 4,
- batch_size: Union[str, int] = 1,
+ batch_size: str | int = 1,
max_batch_size=None,
max_length: int = None,
max_model_len: int = None,
@@ -134,9 +137,9 @@ class VLLM(TemplateLM):
lora_local_path: str = None,
# VLLM: enable thinking tags in the prompt.
enable_thinking: bool = True,
- chat_template_args: Optional[dict] = None,
+ chat_template_args: dict | None = None,
# End marker for thinking tags - splits to get response after this token (if provided).
- think_end_token: Optional[str] = None,
+ think_end_token: str | None = None,
max_lora_rank: int = 16,
**kwargs,
):
@@ -195,11 +198,7 @@ class VLLM(TemplateLM):
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
- if "gemma" in pretrained.lower():
- add_bos_token = True
- eval_logger.info(
- "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
- )
+ self.add_bos_token = add_bos_token
from transformers import AutoConfig
@@ -211,14 +210,17 @@ class VLLM(TemplateLM):
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
revision=tokenizer_revision,
- add_bos_token=add_bos_token,
+ **(
+ {"add_bos_token": self.add_bos_token}
+ if self.add_bos_token is not None
+ else {}
+ ),
)
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self._config)
self.chat_template_args = chat_template_args or {}
self.enable_thinking = self.chat_template_args.pop(
"enable_thinking", enable_thinking
)
- self.add_bos_token = add_bos_token
if parse_version(version("vllm")) >= parse_version("0.8.3"):
kwargs_resolve_hf_chat_template = {
@@ -265,7 +267,7 @@
self.lora_request = None
@property
- def eot_token_id(self):
+ def eot_token_id(self) -> int | None:
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@@ -300,7 +302,7 @@
return self._max_gen_toks
def apply_chat_template(
- self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
+ self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
@@ -337,18 +339,27 @@
def tok_encode(
self,
- string: Union[str, List[str]],
- left_truncate_len: int = None,
- add_special_tokens: bool = False,
+ string: str | list[str],
+ left_truncate_len: int | None = None,
+ add_special_tokens: bool | None = None,
truncation: bool = False,
- ) -> Union[List[int], List[List[int]]]:
- if not add_special_tokens:
- add_special_tokens = False or self.add_bos_token
- encoding: Union[List[List[int]], List[int]] = self.tokenizer(
+ ) -> list[int] | list[list[int]]:
+ add_special_kwargs = (
+ {"add_special_tokens": add_special_tokens or self.add_bos_token}
+ if (add_special_tokens is not None or self.add_bos_token is not None)
+ else {}
+ )
+ # handle chat template
+ if bos_already_added(
+ string[0] if isinstance(string, list) else string, self.tokenizer.bos_token
+ ):
+ add_special_kwargs = {"add_special_tokens": False}
+ encoding: list[list[int]] | list[int] = self.tokenizer(
string,
- add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
+ **add_special_kwargs,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
@@ -362,15 +373,15 @@
def _model_generate(
self,
- requests: List[List[int]] = None,
+ requests: list[list[int]],
generate: bool = False,
- sampling_params: Union[List["SamplingParams"], "SamplingParams", None] = None,
+ sampling_params: list[SamplingParams] | SamplingParams | None = None,
):
if not generate or sampling_params is None:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
- if not isinstance(sampling_params, List):
+ if not isinstance(sampling_params, list):
sampling_params = [sampling_params] * len(requests)
if self.data_parallel_size > 1 and not self.V1:
# vLLM hangs if resources are set in ray.remote
@@ -379,9 +390,9 @@
@ray.remote
def run_inference_one_model(
model_args: dict,
- sampling_params: List["SamplingParams"],
- requests: List[List[int]],
- lora_request: "LoRARequest",
+ sampling_params: list[SamplingParams],
+ requests: list[list[int]],
+ lora_request: LoRARequest,
):
llm = LLM(**model_args)
return llm.generate(
@@ -487,8 +498,8 @@
return outputs
def loglikelihood_rolling(
- self, requests: List[Instance], disable_tqdm: bool = False
- ) -> List[float]:
+ self, requests: list[Instance], disable_tqdm: bool = False
+ ) -> list[float]:
adaptive_batch_size = None
if self.batch_size == "auto":
adaptive_batch_size = len(requests)
@@ -503,7 +514,7 @@
disable=(disable_tqdm or (self.rank != 0)),
)
):
- rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
+ rolling_token_windows: list[tuple[list[int], list[int]]] = list(
map(
make_disjoint_window,
get_rolling_token_windows(
@@ -556,16 +567,14 @@
return loglikelihoods
def generate_until(
- self, requests: List[Instance], disable_tqdm: bool = False
- ) -> List[str]:
+ self, requests: list[Instance], disable_tqdm: bool = False
+ ) -> list[str]:
res = []
# batch tokenize contexts
context, all_gen_kwargs = zip(*(req.args for req in requests))
- context_encoding: List[List[int]] = self.tok_encode(
- context, add_special_tokens=self.add_bos_token
- )
- requests = [
+ context_encoding = self.tok_encode(context)
+ reqs = [
((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
]
@@ -579,7 +588,7 @@
return -len(_requests[0][1]), _requests[0][0]
re_ords = Collator(
- requests,
+ reqs,
_collate_gen,
group_by=None,
)
@@ -588,7 +597,7 @@
)
pbar = tqdm(
- total=len(requests),
+ total=len(reqs),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests",
)
@@ -656,9 +665,9 @@
def _loglikelihood_tokens(
self,
- requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+ requests: list[tuple[tuple[str, str], list[int], list[int]]],
disable_tqdm: bool = False,
- ) -> List[Tuple[float, bool]]:
+ ) -> list[tuple[float, bool]]:
res = []
def _collate(x):
@@ -717,7 +726,7 @@
return re_ord.get_original(res)
@staticmethod
- def _parse_logprobs(tokens: List, outputs, ctxlen: int) -> Tuple[float, bool]:
+ def _parse_logprobs(tokens: list, outputs, ctxlen: int) -> tuple[float, bool]:
"""Process logprobs and tokens.
:param tokens: list
...
# Titulm Bangla MMLU
This repository contains resources related to **Titulm Bangla MMLU**, a benchmark dataset designed for evaluating Bangla language models. The dataset is used for training, development, and comparative evaluation of language models in the Bangla language.
---
## Overview
**TituLLMs** is a family of Bangla large language models (LLMs) with comprehensive benchmarking designed to advance natural language processing for the Bangla language. The benchmark dataset `Titulm Bangla MMLU` covers multiple-choice questions across a diverse range of topics in Bangla.
This dataset is primarily used to train, validate, and evaluate Bangla language models and compare their performance with other existing models.
For more details, please refer to the original research paper:
[https://arxiv.org/abs/2502.11187](https://arxiv.org/abs/2502.11187)
---
## Dataset
The `Titulm Bangla MMLU` dataset can be found on Hugging Face:
[https://huggingface.co/datasets/hishab/titulm-bangla-mmlu](https://huggingface.co/datasets/hishab/titulm-bangla-mmlu)
This dataset was used as a benchmark in the development and evaluation of TituLLMs and related models.
---
## Usage
The dataset is intended for use within the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) repository to evaluate and compare the performance of Bangla language models.
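For example, a minimal sketch of running the task through the harness's Python API; the model below (`gpt2`) is only a placeholder, not a recommended Bangla model:

```python
# Minimal sketch: evaluate a Hugging Face model on bangla_mmlu with lm-evaluation-harness.
# Swap the placeholder pretrained model for the Bangla model you actually want to test.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder checkpoint
    tasks=["bangla_mmlu"],
    num_fewshot=5,
)
print(results["results"]["bangla_mmlu"])
```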
---
## Notes

- The dataset can also be used to evaluate other models.
- Other Bangla datasets (e.g., BoolQ, OpenBookQA) will be added soon.
## Citation
If you use this dataset or model, please cite the original paper:
```bibtex
@misc{nahin2025titullmsfamilybanglallms,
title={TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking},
author={Shahriar Kabir Nahin and Rabindra Nath Nandi and Sagor Sarker and Quazi Sarwar Muhtaseem and Md Kowsher and Apu Chandraw Shill and Md Ibrahim and Mehadi Hasan Menon and Tareq Al Muntasir and Firoj Alam},
year={2025},
eprint={2502.11187},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.11187},
}
```
task: bangla_mmlu
dataset_path: hishab/titulm-bangla-mmlu
dataset_name: all
description: "The following are multiple choice questions (with answers) about range of topics in Bangla"
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}} A. {{options[0]}} B. {{options[1]}} C. {{options[2]}} D. {{options[3]}} Answer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
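To sanity-check the fields this config maps into the prompt (`question`, `options`, `answer`), here is a small sketch assuming the Hugging Face `datasets` library; it mirrors the `doc_to_text` template above:

```python
# Load the dataset named in the config (dataset_path/dataset_name/test_split)
# and rebuild the prompt that the doc_to_text template produces for the first document.
from datasets import load_dataset

ds = load_dataset("hishab/titulm-bangla-mmlu", "all", split="test")
doc = ds[0]
prompt = (
    f"{doc['question'].strip()} "
    f"A. {doc['options'][0]} B. {doc['options'][1]} "
    f"C. {doc['options'][2]} D. {doc['options'][3]} Answer:"
)
print(prompt, "->", doc["answer"])
```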
# LongBench v2
### Paper
Title: `LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks`
Abstract: `This paper introduces LongBench v2, a benchmark designed to assess the ability of LLMs to handle long-context problems requiring deep understanding and reasoning across real-world multitasks. LongBench v2 consists of 503 challenging multiple-choice questions, with contexts ranging from 8k to 2M words, across six major task categories: single-document QA, multi-document QA, long in-context learning, long-dialogue history understanding, code repository understanding, and long structured data understanding. To ensure the breadth and the practicality, we collect data from nearly 100 highly educated individuals with diverse professional backgrounds. We employ both automated and manual review processes to maintain high quality and difficulty, resulting in human experts achieving only 53.7% accuracy under a 15-minute time constraint. Our evaluation reveals that the best-performing model, when directly answers the questions, achieves only 50.1% accuracy. In contrast, the o1-preview model, which includes longer reasoning, achieves 57.7%, surpassing the human baseline by 4%. These results highlight the importance of enhanced reasoning ability and scaling inference-time compute to tackle the long-context challenges in LongBench v2.`
Homepage: `https://github.com/THUDM/LongBench`
### Citation
```
@article{bai2024longbench2,
title={LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks},
author={Yushi Bai and Shangqing Tu and Jiajie Zhang and Hao Peng and Xiaozhi Wang and Xin Lv and Shulin Cao and Jiazheng Xu and Lei Hou and Yuxiao Dong and Jie Tang and Juanzi Li},
journal={arXiv preprint arXiv:2412.15204},
year={2024}
}
```
### Groups, Tags, and Tasks
#### Groups
* `longbench2_single`: Single-document QA tasks requiring comprehension of documents across various domains (government, legal, literature, finance, academic, detective stories, and order of events)
* `longbench2_multi`: Multi-document QA tasks requiring information synthesis and reasoning across multiple documents in government, academic, finance, and news
* `longbench2_incontext`: Long in-context learning tasks including user guide comprehension, translation with examples, and many-shot learning scenarios
* `longbench2_history`: Long-dialogue history understanding tasks involving agent conversations and dialogue history comprehension
* `longbench2_structured`: Long structured data understanding tasks for graph and table data processing
#### Tags
* `longbench2`: Run the full benchmark with 503 multiple-choice questions (8k-2M words) testing understanding and reasoning on long-context tasks
#### Tasks
**Single-Document QA:**
* `longbench2_govt_single`: Question answering from single government documents
* `longbench2_legal_single`: Question answering from single legal documents
* `longbench2_lit_single`: Question answering from single literature/literary documents
* `longbench2_fin_single`: Question answering from single financial documents
* `longbench2_academic_single`: Question answering from single academic papers and research documents
* `longbench2_detective`: Question answering from detective stories requiring logical reasoning
* `longbench2_event_order`: Temporal reasoning tasks about event ordering in narratives
**Multi-Document QA:**
* `longbench2_govt_multi`: Question answering across multiple government documents
* `longbench2_academic_multi`: Question answering across multiple academic papers
* `longbench2_fin_multi`: Question answering across multiple financial documents
* `longbench2_news_multi`: Question answering across multiple news articles
**Long In-context Learning:**
* `longbench2_user_guide`: Comprehension and application of user guide instructions
* `longbench2_translate`: Translation tasks in new languages with long examples
* `longbench2_many_shot`: Few-shot learning with many examples in context
**Long-dialogue History Understanding:**
* `longbench2_agent_history`: Understanding and reasoning over extended agent conversation histories
* `longbench2_dialogue_history`: Understanding and reasoning over long dialogue exchanges
**Code Repository Understanding:**
* `longbench2_code`: Question answering on code repositories requiring codebase comprehension
**Long Structured Data Understanding:**
* `longbench2_graph`: Understanding and reasoning over graph-structured data
* `longbench2_table`: Understanding and reasoning over tabular data
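The groups and tasks listed above can be passed to the harness by name; a short sketch follows (the placeholder model is for illustration only, since these tasks need a genuinely long-context model):

```python
# Sketch: run a couple of the LongBench v2 groups/tasks listed above, or the whole
# `longbench2` tag, by passing their names to simple_evaluate.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder; use a long-context model in practice
    tasks=["longbench2_code", "longbench2_single"],
)
print(results["results"])
```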
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: longbench2
task:
- longbench2_history_tasks
- longbench2_incontext_tasks
- longbench2_multi_tasks
- longbench2_single_tasks
- longbench2_structured_tasks
- longbench2_code
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_history
group_alias: "Long-dialogue History Understanding"
task:
- longbench2_agent_history
- longbench2_dialogue_history
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_incontext
group_alias: "Long In-context Learning"
task:
- longbench2_user_guide
- longbench2_translate
- longbench2_many_shot
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_multi
group_alias: "Multi-Document QA"
task:
- longbench2_govt_multi
- longbench2_academic_multi
- longbench2_fin_multi
- longbench2_news_multi
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_single
group_alias: "Single-Document QA"
task:
- longbench2_govt_single
- longbench2_legal_single
- longbench2_lit_single
- longbench2_fin_single
- longbench2_event_order
- longbench2_academic_single
- longbench2_detective
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_structured
group_alias: "Long Structured Data Understanding"
task:
- longbench2_graph
- longbench2_table
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
dataset_path: recursal/longbench-v2
test_split: train
output_type: multiple_choice
doc_to_text: "Please read the following text and answer the question below.\n\n<text>\n{{context}}\n</text>\n\nWhat is the correct answer to this question: {{question.strip()}}\nChoices:\n(A) {{choices[0]}}\n(B) {{choices[1]}}\n(C) {{choices[2]}}\n(D) {{choices[3]}}\n\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
include: _longbench_common_yaml
tag:
- longbench2_tasks
- longbench2_multi_tasks
task: longbench2_academic_multi
dataset_name: academic_multi