Commit 79b31dad authored by Baber

Merge branch 'bos' into mrl

parents cbb8f5a4 7e5f909b
......@@ -44,11 +44,11 @@ jobs:
echo "One or more test file(s) has changed."
echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
- name: Set up Python 3.9
- name: Set up Python 3.10
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.10'
cache: 'pip'
cache-dependency-path: pyproject.toml
- name: Install dependencies
......
......@@ -22,10 +22,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python 3.9
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.10'
cache: pip
cache-dependency-path: pyproject.toml
- name: Pre-Commit
......@@ -39,7 +39,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.9", "3.10", "3.11"]
python-version: ["3.10", "3.11", "3.12"]
timeout-minutes: 30
steps:
- name: Checkout Code
......
from __future__ import annotations
import abc
import hashlib
import json
import logging
import os
from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, TypeVar, Union
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, TypeVar
from tqdm import tqdm
......@@ -31,7 +34,7 @@ class LM(abc.ABC):
# set rank and world size to a single process, by default.
self._rank = 0
self._world_size = 1
self.cache_hook: "CacheHook" = CacheHook(None)
self.cache_hook: CacheHook = CacheHook(None)
@abc.abstractmethod
def loglikelihood(self, requests) -> list[tuple[float, bool]]:
......@@ -137,7 +140,7 @@ class LM(abc.ABC):
@classmethod
def create_from_arg_string(
cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
cls: type[T], arg_string: str, additional_config: dict | None = None
) -> T:
"""
Creates an instance of the LM class using the given argument string and additional config.
......@@ -156,7 +159,7 @@ class LM(abc.ABC):
@classmethod
def create_from_arg_obj(
cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None
cls: type[T], arg_dict: dict, additional_config: dict | None = None
) -> T:
"""
Creates an instance of the LM class using the given arg_obj
......@@ -199,7 +202,7 @@ class LM(abc.ABC):
"To use this model with chat templates, please implement the 'tokenizer_name' property."
)
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
def chat_template(self, chat_template: bool | str = False) -> str | None:
"""Returns the chat template structure for user/assistant messages if a template is provided.
This method is intended to be overridden in a subclass to define a specific chat template format.
For models that do not support chat templates, this method returns None by default.
......@@ -207,7 +210,7 @@ class LM(abc.ABC):
return ""
def set_cache_hook(self, cache_hook: "CacheHook") -> None:
def set_cache_hook(self, cache_hook: CacheHook) -> None:
self.cache_hook = cache_hook
......@@ -218,9 +221,9 @@ def hash_args(attr: str, args: Iterable[Any]) -> str:
class CacheHook:
def __init__(self, cachinglm: Optional["CachingLM"]) -> None:
def __init__(self, cachinglm: CachingLM | None) -> None:
if cachinglm is None:
self.dbdict: Optional["SqliteDict"] = None
self.dbdict: SqliteDict | None = None
return
self.dbdict = cachinglm.dbdict
......@@ -258,7 +261,7 @@ class CachingLM:
eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
return lm_attr
def _fn(requests: list["Instance"]) -> list["Instance"]:
def _fn(requests: list[Instance]) -> list[Instance]:
res = []
remaining_reqs = []
warned = False
......@@ -313,7 +316,7 @@ class CachingLM:
return _fn
def get_cache_hook(self) -> "CacheHook":
def get_cache_hook(self) -> CacheHook:
return CacheHook(self)
......@@ -324,10 +327,11 @@ class TemplateLM(LM):
"""
tokenizer = None
backend = "causal"
@property
@abc.abstractmethod
def eot_token_id(self):
def eot_token_id(self) -> int:
pass
@property
......@@ -336,7 +340,9 @@ class TemplateLM(LM):
return self.eot_token_id
@abc.abstractmethod
def tok_encode(self, string: str, **kwargs) -> list[int]:
def tok_encode(
self, string: str, add_special_tokens: bool | None = None, **kwargs
) -> list[int]:
"""
Tokenize a string using the model's tokenizer and return a list of token IDs.
"""
......@@ -344,45 +350,100 @@ class TemplateLM(LM):
@abc.abstractmethod
def _loglikelihood_tokens(
self, requests: list["Instance"], **kwargs
self, requests: list[Instance], **kwargs
) -> list[tuple[float, bool]]:
pass
def _encode_pair(
self, context: str, continuation: str
) -> tuple[list[int], list[int]]:
import transformers
"""
Encode a context-continuation pair into separate token ID lists.
This method handles the tokenization of context and continuation strings while
preserving proper boundary handling. Trailing spaces in the context are moved
to the beginning of the continuation to ensure correct tokenization at the
word boundary.
For Seq2Seq models (encoder-decoder), context and continuation are encoded
separately. For other model types (decoder-only), the full sequence is encoded
together to ensure proper tokenization, then split at the context boundary.
:param context: str
The context string. Can be empty (will be handled by the caller).
:param continuation: str
The continuation string to be scored.
:return: tuple[list[int], list[int]]
A tuple of (context_enc, continuation_enc) where:
- context_enc: Token IDs for the context
- continuation_enc: Token IDs for the continuation
Note:
This method does NOT handle empty context. The caller should
handle empty context (see loglikelihood method).
"""
assert context, "Context cannot be empty!"
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
model_class = getattr(self, "AUTO_MODEL_CLASS", None)
if model_class == transformers.AutoModelForSeq2SeqLM:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
else:
if self.backend == "causal":
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
else:
# for SEQ2SEQ case we need to encode separately
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
return context_enc, continuation_enc
def loglikelihood(
self, requests: list["Instance"], disable_tqdm: bool = False
self, requests: list[Instance], disable_tqdm: bool = False
) -> list[tuple[float, bool]]:
"""
Compute log-likelihood of generating continuations from contexts.
This is the concrete implementation for TemplateLM and its subclasses.
It tokenizes context-continuation pairs and delegates scoring to
_loglikelihood_tokens.
**IMPORTANT**: This method is expected to handle empty context strings.
When context is empty (""), it uses the model's prefix_token_id (typically
BOS or EOS token) as context. If the continuation already starts with the
prefix token, it reuses that token as context instead of duplicating it.
:param requests: list[Instance]
List of Instance objects with property `args` returning (context, continuation) tuples.
:param disable_tqdm: bool
Whether to disable the progress bar in _loglikelihood_tokens.
:return: list[tuple[float, bool]]
List of (log_prob, is_greedy) tuples for each request.
Implementation details:
- Empty context: Uses prefix_token_id (BOS/EOS) as context
- Non-empty context: Uses _encode_pair for proper tokenization
- Avoids token duplication when continuation starts with prefix_token_id
"""
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
# BOS or EOS as context
continuation_enc = self.tok_encode(
continuation, add_special_tokens=False
)
# BOS or EOS as context: when context is empty, treat (context + continuation) as (BOS + continuation)
context_enc, continuation_enc = (
[self.prefix_token_id],
self.tok_encode(continuation),
([self.prefix_token_id], continuation_enc)
if self.prefix_token_id != continuation_enc[0]
else (continuation_enc[:1], continuation_enc[1:])
)
# BOS or EOS as context
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
......@@ -400,7 +461,7 @@ class TemplateLM(LM):
def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
pass
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
def chat_template(self, chat_template: bool | str = False) -> str | None:
"""
Set and get the appropriate chat template for the model.
This method sets the tokenizer's chat_template and returns the template string for reproducibility.
......
......@@ -114,7 +114,7 @@ class TemplateAPI(TemplateLM):
# however the requests can be sent as a string if the API doesn't support token inputs.
# use tokenized_requests=False
tokenizer_backend: Optional[
Literal["tiktoken", "huggingface", "None", "none"]
Literal["tiktoken", "huggingface", "remote", "None", "none"]
] = "huggingface",
truncate: bool = False,
# number of concurrent requests. More useful if not batching
......@@ -132,6 +132,8 @@ class TemplateAPI(TemplateLM):
revision: Optional[str] = "main",
use_fast_tokenizer: bool = True,
verify_certificate: bool = True,
ca_cert_path: Optional[str] = None,
auth_token: Optional[str] = None,
eos_string: str = None,
# timeout in seconds
timeout: int = 300,
......@@ -182,6 +184,8 @@ class TemplateAPI(TemplateLM):
self.tokenized_requests = tokenized_requests
self.max_retries = int(max_retries)
self.verify_certificate = verify_certificate
self.ca_cert_path = ca_cert_path
self.auth_token = auth_token
self._eos_string = eos_string
self.timeout = int(timeout)
self.max_images = int(max_images)
......@@ -218,6 +222,21 @@ class TemplateAPI(TemplateLM):
f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. "
"Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
)
elif self.tokenizer_backend == "remote":
from lm_eval.utils import RemoteTokenizer
if not self.base_url:
raise ValueError(
"base_url is required for remote tokenizer backend"
)
self.tokenizer = RemoteTokenizer(
self.base_url,
self.timeout,
self.verify_certificate,
self.ca_cert_path,
self.auth_token,
)
eval_logger.info(f"Using remote tokenizer from {self.base_url}")
else:
import transformers
......@@ -310,7 +329,7 @@ class TemplateAPI(TemplateLM):
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
) -> Union[str, JsonChatStr]:
) -> Union[str, JsonChatStr, List[Dict]]:
"""Applies a chat template to a list of chat history between user and model."""
if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
return self.tokenizer.apply_chat_template(
......@@ -319,6 +338,8 @@ class TemplateAPI(TemplateLM):
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
)
elif self.tokenizer_backend == "remote" and self.tokenized_requests:
return chat_history
else:
# bit of a hack. We'll load back before sending to the API
return JsonChatStr(
......@@ -337,6 +358,8 @@ class TemplateAPI(TemplateLM):
return self.tokenizer.eos_token_id
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.eot_token
elif self.tokenizer_backend == "remote":
return self.tokenizer.eos_token_id
@cached_property
def eos_string(self) -> Optional[str]:
......@@ -347,6 +370,8 @@ class TemplateAPI(TemplateLM):
return self.tokenizer.eos_token
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.decode([self.tokenizer.eot_token])
elif self.tokenizer_backend == "remote":
return self.tokenizer.eos_token
else:
eval_logger.warning(
"Cannot determine EOS string to pass to stop sequence. Manually set by passing `eos_string` to model_args."
......@@ -364,6 +389,8 @@ class TemplateAPI(TemplateLM):
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
elif self.tokenizer_backend == "remote":
return self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
else:
return self.tokenizer.eot_token
......@@ -396,7 +423,19 @@ class TemplateAPI(TemplateLM):
encoding = encoding[-left_truncate_len:]
return encoding
elif self.tokenizer_backend == "remote":
if isinstance(string, str):
encoding = self.tokenizer.encode(string)
else:
encoding = [self.tokenizer.encode(s) for s in string]
if left_truncate_len:
if isinstance(string, str):
encoding = encoding[-left_truncate_len:]
else:
encoding = [enc[-left_truncate_len:] for enc in encoding]
return encoding
else:
try:
encoding = self.tokenizer.encode(string)
......@@ -409,6 +448,8 @@ class TemplateAPI(TemplateLM):
return self.tokenizer.batch_decode(tokens)
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.decode_batch(tokens)
elif self.tokenizer_backend == "remote":
return self.tokenizer.batch_decode(tokens)
def model_call(
self,
......
......@@ -32,6 +32,7 @@ from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
bos_already_added,
clear_torch_cache,
configure_pad_token,
get_dtype,
......@@ -84,7 +85,7 @@ class HFLM(TemplateLM):
max_batch_size: int | None = 64,
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
add_bos_token: bool | None = False,
add_bos_token: bool | None = None,
prefix_token_id: int | None = None,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
......@@ -258,11 +259,6 @@ class HFLM(TemplateLM):
)
self.add_bos_token = add_bos_token
if "gemma" in getattr(self.config, "model_type", ""):
self.add_bos_token = True
eval_logger.info(
f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
)
self._max_length = max_length
self.pretrained = pretrained
......@@ -744,7 +740,7 @@ class HFLM(TemplateLM):
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
gguf_file: str | None = None,
add_bos_token: bool | None = False,
add_bos_token: bool | None = None,
subfolder: str | None = "",
) -> None:
"""Helper method during initialization.
......@@ -763,8 +759,8 @@ class HFLM(TemplateLM):
else:
kwargs["use_fast"] = use_fast_tokenizer
if add_bos_token:
kwargs["add_bos_token"] = True
if add_bos_token is not None:
kwargs["add_bos_token"] = add_bos_token
if subfolder:
kwargs["subfolder"] = subfolder
......@@ -858,23 +854,20 @@ class HFLM(TemplateLM):
def tok_encode(
self,
string: str,
left_truncate_len: int | None = None,
add_special_tokens: bool | None = None,
left_truncate_len: int | None = None,
**kwargs,
) -> list[int]:
""" """
# default for None - empty dict, use predefined tokenizer param
# used for all models except for CausalLM or predefined value
special_tokens_kwargs = {}
# by default for CausalLM - false or self.add_bos_token is set
if add_special_tokens is None:
if self.backend == "causal":
special_tokens_kwargs = {
"add_special_tokens": False or self.add_bos_token
}
# otherwise the method explicitly defines the value
else:
special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
special_tokens_kwargs = (
{"add_special_tokens": add_special_tokens}
if (isinstance(add_special_tokens, bool))
else {"add_special_tokens": self.add_bos_token}
if self.add_bos_token is not None
else {}
)
encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
......@@ -897,7 +890,14 @@ class HFLM(TemplateLM):
add_special_tokens = {}
if self.backend == "causal":
add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
if bos_already_added(
strings[0], getattr(self.tokenizer, "bos_token", None)
):
add_special_tokens = {"add_special_tokens": False}
elif self.add_bos_token is not None:
add_special_tokens = {"add_special_tokens": self.add_bos_token}
else:
add_special_tokens = {}
encoding = self.tokenizer(
strings,
......@@ -971,7 +971,7 @@ class HFLM(TemplateLM):
context,
max_length: int,
stop: list[str],
**generation_kwargs: dict[str, Any],
**generation_kwargs,
) -> torch.Tensor:
# temperature = 0.0 if not set
# if do_sample is false and temp==0.0:
......
......@@ -16,12 +16,46 @@ eval_logger = logging.getLogger(__name__)
class LocalCompletionsAPI(TemplateAPI):
def __init__(
self,
base_url: str = None,
tokenizer_backend: str = "huggingface",
base_url=None,
tokenizer_backend="auto",
verify_certificate=True,
ca_cert_path=None,
auth_token=None,
**kwargs,
):
# Auto-detect tokenizer backend
if tokenizer_backend == "auto":
if base_url:
from lm_eval.utils import check_remote_tokenizer_support
if check_remote_tokenizer_support(
base_url,
verify_certificate=verify_certificate,
ca_cert_path=ca_cert_path,
auth_token=auth_token,
):
eval_logger.info(
"Auto-detected remote tokenizer support. Using remote tokenizer backend."
)
tokenizer_backend = "remote"
else:
eval_logger.info(
"Remote tokenizer not supported. Using huggingface tokenizer backend."
)
tokenizer_backend = "huggingface"
else:
eval_logger.warning(
"No base_url provided. Using huggingface tokenizer backend."
)
tokenizer_backend = "huggingface"
super().__init__(
base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
base_url=base_url,
tokenizer_backend=tokenizer_backend,
verify_certificate=verify_certificate,
ca_cert_path=ca_cert_path,
auth_token=auth_token,
**kwargs,
)
def _create_payload(
......@@ -98,20 +132,28 @@ class LocalCompletionsAPI(TemplateAPI):
@register_model("local-chat-completions")
class LocalChatCompletion(LocalCompletionsAPI):
"""
Minimal chat-completions wrapper.
- Only accepts messages as list[dict].
- No tokenization or template logic.
- Use with --apply_chat_template or ensure upstream formats messages correctly.
"""
def __init__(
self,
base_url: str = None,
tokenizer_backend: str = None,
tokenized_requests: bool = False,
base_url=None,
verify_certificate=True,
ca_cert_path=None,
auth_token=None,
**kwargs,
):
eval_logger.warning(
"chat-completions endpoint requires the `--apply_chat_template` flag."
)
super().__init__(
base_url=base_url,
tokenizer_backend=tokenizer_backend,
tokenized_requests=tokenized_requests,
tokenizer_backend=None,
tokenized_requests=None,
verify_certificate=verify_certificate,
ca_cert_path=ca_cert_path,
auth_token=auth_token,
**kwargs,
)
if self._batch_size > 1:
......@@ -129,9 +171,13 @@ class LocalChatCompletion(LocalCompletionsAPI):
eos=None,
**kwargs,
) -> dict:
assert type(messages) is not str, (
"chat-completions require the --apply_chat_template flag."
assert isinstance(messages, list) and all(
isinstance(m, dict) for m in messages
), (
"LocalChatCompletion expects messages as list[dict]. "
"If you see this error, ensure --apply_chat_template is set or upstream code formats messages correctly."
)
gen_kwargs = gen_kwargs or {}
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
......
......@@ -150,7 +150,7 @@ class Grouper:
def pad_and_concat(
max_length: int,
tensors: List[torch.Tensor],
tensors: list[torch.Tensor],
padding_side: Literal["right", "left"] = "right",
):
"""
......@@ -881,3 +881,7 @@ def postprocess_generated_text(
generation = generation.split(think_end_token)[-1].lstrip()
return generation
def bos_already_added(sequence: str, bos_string: Optional[str]) -> bool:
return sequence.startswith(bos_string) if bos_string is not None else False
from __future__ import annotations
import copy
import gc
import logging
......@@ -7,7 +9,7 @@ from importlib.util import find_spec
from multiprocessing import Process, Queue
from queue import Empty
from time import sleep
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
from typing import TYPE_CHECKING, Literal
import jinja2
from more_itertools import distribute
......@@ -19,6 +21,7 @@ from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
bos_already_added,
configure_pad_token,
handle_stop_sequences,
postprocess_generated_text,
......@@ -50,10 +53,10 @@ eval_logger = logging.getLogger(__name__)
def _vllm_mp_worker(
model_args: dict,
sampling_params: list["SamplingParams"],
sampling_params: list[SamplingParams],
requests: list[list[int]],
lora_request: "LoRARequest",
result_queue: "Queue",
lora_request: LoRARequest,
result_queue: Queue,
dp_size: int,
local_dp_rank: int,
dp_master_port: int,
......@@ -113,18 +116,18 @@ class VLLM(TemplateLM):
self,
pretrained: str,
dtype: Literal["float16", "bfloat16", "float32", "auto"] = "auto",
revision: Optional[str] = None,
trust_remote_code: Optional[bool] = False,
tokenizer: Optional[str] = None,
revision: str | None = None,
trust_remote_code: bool | None = False,
tokenizer: str | None = None,
tokenizer_mode: Literal["auto", "slow"] = "auto",
tokenizer_revision: Optional[str] = None,
add_bos_token: Optional[bool] = False,
prefix_token_id: Optional[int] = None,
tokenizer_revision: str | None = None,
add_bos_token: bool | None = False,
prefix_token_id: int | None = None,
tensor_parallel_size: int = 1,
quantization: Optional[str] = None,
quantization: str | None = None,
max_gen_toks: int = 256,
swap_space: int = 4,
batch_size: Union[str, int] = 1,
batch_size: str | int = 1,
max_batch_size=None,
max_length: int = None,
max_model_len: int = None,
......@@ -134,9 +137,9 @@ class VLLM(TemplateLM):
lora_local_path: str = None,
# VLLM: enable thinking tags in the prompt.
enable_thinking: bool = True,
chat_template_args: Optional[dict] = None,
chat_template_args: dict | None = None,
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token: Optional[str] = None,
think_end_token: str | None = None,
max_lora_rank: int = 16,
**kwargs,
):
......@@ -195,11 +198,7 @@ class VLLM(TemplateLM):
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
if "gemma" in pretrained.lower():
add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
)
self.add_bos_token = add_bos_token
from transformers import AutoConfig
......@@ -211,14 +210,17 @@ class VLLM(TemplateLM):
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
revision=tokenizer_revision,
add_bos_token=add_bos_token,
**(
{"add_bos_token": self.add_bos_token}
if self.add_bos_token is not None
else {}
),
)
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self._config)
self.chat_template_args = chat_template_args or {}
self.enable_thinking = self.chat_template_args.pop(
"enable_thinking", enable_thinking
)
self.add_bos_token = add_bos_token
if parse_version(version("vllm")) >= parse_version("0.8.3"):
kwargs_resolve_hf_chat_template = {
......@@ -265,7 +267,7 @@ class VLLM(TemplateLM):
self.lora_request = None
@property
def eot_token_id(self):
def eot_token_id(self) -> int | None:
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
......@@ -300,7 +302,7 @@ class VLLM(TemplateLM):
return self._max_gen_toks
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
......@@ -337,18 +339,27 @@ class VLLM(TemplateLM):
def tok_encode(
self,
string: Union[str, List[str]],
left_truncate_len: int = None,
add_special_tokens: bool = False,
string: str | list[str],
left_truncate_len: int | None = None,
add_special_tokens: bool | None = None,
truncation: bool = False,
) -> Union[List[int], List[List[int]]]:
if not add_special_tokens:
add_special_tokens = False or self.add_bos_token
encoding: Union[List[List[int]], List[int]] = self.tokenizer(
) -> list[int] | list[list[int]]:
add_special_kwargs = (
{"add_special_tokens": add_special_tokens or self.add_bos_token}
if (add_special_tokens is not None or self.add_bos_token is not None)
else {}
)
# handle chat templates that already prepend the BOS string: avoid adding it a second time
if bos_already_added(
string[0] if isinstance(string, list) else string, self.tokenizer.bos_token
):
add_special_kwargs = {"add_special_tokens": False}
encoding: list[list[int]] | list[int] = self.tokenizer(
string,
add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
**add_special_kwargs,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
......@@ -362,15 +373,15 @@ class VLLM(TemplateLM):
def _model_generate(
self,
requests: List[List[int]] = None,
requests: list[list[int]],
generate: bool = False,
sampling_params: Union[List["SamplingParams"], "SamplingParams", None] = None,
sampling_params: list[SamplingParams] | SamplingParams | None = None,
):
if not generate or sampling_params is None:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if not isinstance(sampling_params, List):
if not isinstance(sampling_params, list):
sampling_params = [sampling_params] * len(requests)
if self.data_parallel_size > 1 and not self.V1:
# vLLM hangs if resources are set in ray.remote
......@@ -379,9 +390,9 @@ class VLLM(TemplateLM):
@ray.remote
def run_inference_one_model(
model_args: dict,
sampling_params: List["SamplingParams"],
requests: List[List[int]],
lora_request: "LoRARequest",
sampling_params: list[SamplingParams],
requests: list[list[int]],
lora_request: LoRARequest,
):
llm = LLM(**model_args)
return llm.generate(
......@@ -487,8 +498,8 @@ class VLLM(TemplateLM):
return outputs
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
self, requests: list[Instance], disable_tqdm: bool = False
) -> list[float]:
adaptive_batch_size = None
if self.batch_size == "auto":
adaptive_batch_size = len(requests)
......@@ -503,7 +514,7 @@ class VLLM(TemplateLM):
disable=(disable_tqdm or (self.rank != 0)),
)
):
rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
rolling_token_windows: list[tuple[list[int], list[int]]] = list(
map(
make_disjoint_window,
get_rolling_token_windows(
......@@ -556,16 +567,14 @@ class VLLM(TemplateLM):
return loglikelihoods
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
self, requests: list[Instance], disable_tqdm: bool = False
) -> list[str]:
res = []
# batch tokenize contexts
context, all_gen_kwargs = zip(*(req.args for req in requests))
context_encoding: List[List[int]] = self.tok_encode(
context, add_special_tokens=self.add_bos_token
)
requests = [
context_encoding = self.tok_encode(context)
reqs = [
((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
]
......@@ -579,7 +588,7 @@ class VLLM(TemplateLM):
return -len(_requests[0][1]), _requests[0][0]
re_ords = Collator(
requests,
reqs,
_collate_gen,
group_by=None,
)
......@@ -588,7 +597,7 @@ class VLLM(TemplateLM):
)
pbar = tqdm(
total=len(requests),
total=len(reqs),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests",
)
......@@ -656,9 +665,9 @@ class VLLM(TemplateLM):
def _loglikelihood_tokens(
self,
requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
requests: list[tuple[tuple[str, str], list[int], list[int]]],
disable_tqdm: bool = False,
) -> List[Tuple[float, bool]]:
) -> list[tuple[float, bool]]:
res = []
def _collate(x):
......@@ -717,7 +726,7 @@ class VLLM(TemplateLM):
return re_ord.get_original(res)
@staticmethod
def _parse_logprobs(tokens: List, outputs, ctxlen: int) -> Tuple[float, bool]:
def _parse_logprobs(tokens: list, outputs, ctxlen: int) -> tuple[float, bool]:
"""Process logprobs and tokens.
:param tokens: list
......
# Titulm Bangla MMLU
This repository contains resources related to **Titulm Bangla MMLU**, a benchmark dataset designed for evaluating Bangla language models. The dataset is used for training, development, and comparative evaluation of language models in the Bangla language.
---
## Overview
**TituLLMs** is a family of Bangla large language models (LLMs) with comprehensive benchmarking designed to advance natural language processing for the Bangla language. The benchmark dataset `Titulm Bangla MMLU` covers multiple-choice questions across a diverse range of topics in Bangla.
This dataset is primarily used to train, validate, and evaluate Bangla language models and compare their performance with other existing models.
For more details, please refer to the original research paper:
[https://arxiv.org/abs/2502.11187](https://arxiv.org/abs/2502.11187)
---
## Dataset
The `Titulm Bangla MMLU` dataset can be found on Hugging Face:
[https://huggingface.co/datasets/hishab/titulm-bangla-mmlu](https://huggingface.co/datasets/hishab/titulm-bangla-mmlu)
This dataset was used as a benchmark in the development and evaluation of TituLLMs and related models.
---
## Usage
The dataset is intended for use within the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) repository to evaluate and compare the performance of Bangla language models.
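A minimal sketch of programmatic usage (the model identifier below is a placeholder; substitute any Hugging Face causal LM):

```python
# Minimal sketch: evaluate a model on the bangla_mmlu task through the
# lm-evaluation-harness Python API. Adjust model_args to your setup.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-bangla-model",  # placeholder model id
    tasks=["bangla_mmlu"],
    num_fewshot=5,
    batch_size=8,
)
print(results["results"]["bangla_mmlu"])
```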
---
## Notes

- The dataset can also be used to evaluate other models.
- Other datasets such as BoolQ and OpenBookQA will be added soon.
## Citation
If you use this dataset or model, please cite the original paper:
```bibtex
@misc{nahin2025titullmsfamilybanglallms,
title={TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking},
author={Shahriar Kabir Nahin and Rabindra Nath Nandi and Sagor Sarker and Quazi Sarwar Muhtaseem and Md Kowsher and Apu Chandraw Shill and Md Ibrahim and Mehadi Hasan Menon and Tareq Al Muntasir and Firoj Alam},
year={2025},
eprint={2502.11187},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.11187},
}
```
task: bangla_mmlu
dataset_path: hishab/titulm-bangla-mmlu
dataset_name: all
description: "The following are multiple choice questions (with answers) about range of topics in Bangla"
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}} A. {{options[0]}} B. {{options[1]}} C. {{options[2]}} D. {{options[3]}} Answer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
# LongBench v2
### Paper
Title: `LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks`
Abstract: `This paper introduces LongBench v2, a benchmark designed to assess the ability of LLMs to handle long-context problems requiring deep understanding and reasoning across real-world multitasks. LongBench v2 consists of 503 challenging multiple-choice questions, with contexts ranging from 8k to 2M words, across six major task categories: single-document QA, multi-document QA, long in-context learning, long-dialogue history understanding, code repository understanding, and long structured data understanding. To ensure breadth and practicality, we collect data from nearly 100 highly educated individuals with diverse professional backgrounds. We employ both automated and manual review processes to maintain high quality and difficulty, resulting in human experts achieving only 53.7% accuracy under a 15-minute time constraint. Our evaluation reveals that the best-performing model, when answering the questions directly, achieves only 50.1% accuracy. In contrast, the o1-preview model, which includes longer reasoning, achieves 57.7%, surpassing the human baseline by 4%. These results highlight the importance of enhanced reasoning ability and scaling inference-time compute to tackle the long-context challenges in LongBench v2.`
Homepage: `https://github.com/THUDM/LongBench`
### Citation
```
@article{bai2024longbench2,
title={LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks},
author={Yushi Bai and Shangqing Tu and Jiajie Zhang and Hao Peng and Xiaozhi Wang and Xin Lv and Shulin Cao and Jiazheng Xu and Lei Hou and Yuxiao Dong and Jie Tang and Juanzi Li},
journal={arXiv preprint arXiv:2412.15204},
year={2024}
}
```
### Groups, Tags, and Tasks
#### Groups
* `longbench2_single`: Single-document QA tasks requiring comprehension of documents across various domains (government, legal, literature, finance, academic, detective stories, and order of events)
* `longbench2_multi`: Multi-document QA tasks requiring information synthesis and reasoning across multiple documents in government, academic, finance, and news
* `longbench2_incontext`: Long in-context learning tasks including user guide comprehension, translation with examples, and many-shot learning scenarios
* `longbench2_history`: Long-dialogue history understanding tasks involving agent conversations and dialogue history comprehension
* `longbench2_structured`: Long structured data understanding tasks for graph and table data processing
#### Tags
* `longbench2`: Run the full benchmark with 503 multiple-choice questions (8k-2M words) testing understanding and reasoning on long-context tasks
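For example, the whole tag can be run through the harness's Python API; a minimal sketch (the model identifier and `max_length` are placeholders, and long contexts require a correspondingly large context window):

```python
# Minimal sketch: run the full longbench2 tag (or a narrower group such as
# longbench2_single) with the lm-evaluation-harness Python API.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-long-context-model,max_length=131072",
    tasks=["longbench2"],
    batch_size=1,
)
print(results["results"])
```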
#### Tasks
**Single-Document QA:**
* `longbench2_govt_single`: Question answering from single government documents
* `longbench2_legal_single`: Question answering from single legal documents
* `longbench2_lit_single`: Question answering from single literature/literary documents
* `longbench2_fin_single`: Question answering from single financial documents
* `longbench2_academic_single`: Question answering from single academic papers and research documents
* `longbench2_detective`: Question answering from detective stories requiring logical reasoning
* `longbench2_event_order`: Temporal reasoning tasks about event ordering in narratives
**Multi-Document QA:**
* `longbench2_govt_multi`: Question answering across multiple government documents
* `longbench2_academic_multi`: Question answering across multiple academic papers
* `longbench2_fin_multi`: Question answering across multiple financial documents
* `longbench2_news_multi`: Question answering across multiple news articles
**Long In-context Learning:**
* `longbench2_user_guide`: Comprehension and application of user guide instructions
* `longbench2_translate`: Translation tasks in new languages with long examples
* `longbench2_many_shot`: Many-shot in-context learning with a large number of examples provided in context
**Long-dialogue History Understanding:**
* `longbench2_agent_history`: Understanding and reasoning over extended agent conversation histories
* `longbench2_dialogue_history`: Understanding and reasoning over long dialogue exchanges
**Code Repository Understanding:**
* `longbench2_code`: Question answering on code repositories requiring codebase comprehension
**Long Structured Data Understanding:**
* `longbench2_graph`: Understanding and reasoning over graph-structured data
* `longbench2_table`: Understanding and reasoning over tabular data
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: longbench2
task:
- longbench2_history_tasks
- longbench2_incontext_tasks
- longbench2_multi_tasks
- longbench2_single_tasks
- longbench2_structured_tasks
- longbench2_code
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_history
group_alias: "Long-dialogue History Understanding"
task:
- longbench2_agent_history
- longbench2_dialogue_history
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_incontext
group_alias: "Long In-context Learning"
task:
- longbench2_user_guide
- longbench2_translate
- longbench2_many_shot
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_multi
group_alias: "Multi-Document QA"
task:
- longbench2_govt_multi
- longbench2_academic_multi
- longbench2_fin_multi
- longbench2_news_multi
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_single
group_alias: "Single-Document QA"
task:
- longbench2_govt_single
- longbench2_legal_single
- longbench2_lit_single
- longbench2_fin_single
- longbench2_event_order
- longbench2_academic_single
- longbench2_detective
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_structured
group_alias: "Long Structured Data Understanding"
task:
- longbench2_graph
- longbench2_table
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
dataset_path: recursal/longbench-v2
test_split: train
output_type: multiple_choice
doc_to_text: "Please read the following text and answer the question below.\n\n<text>\n{{context}}\n</text>\n\nWhat is the correct answer to this question: {{question.strip()}}\nChoices:\n(A) {{choices[0]}}\n(B) {{choices[1]}}\n(C) {{choices[2]}}\n(D) {{choices[3]}}\n\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
include: _longbench_common_yaml
tag:
- longbench2_tasks
- longbench2_multi_tasks
task: longbench2_academic_multi
dataset_name: academic_multi