Commit 79b31dad authored by Baber

Merge branch 'bos' into mrl

parents cbb8f5a4 7e5f909b
@@ -44,11 +44,11 @@ jobs:
echo "One or more test file(s) has changed."
echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
- - name: Set up Python 3.9
+ - name: Set up Python 3.10
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
uses: actions/setup-python@v5
with:
- python-version: 3.9
+ python-version: '3.10'
cache: 'pip'
cache-dependency-path: pyproject.toml
- name: Install dependencies
...
@@ -22,10 +22,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- - name: Set up Python 3.9
+ - name: Set up Python 3.10
uses: actions/setup-python@v5
with:
- python-version: 3.9
+ python-version: '3.10'
cache: pip
cache-dependency-path: pyproject.toml
- name: Pre-Commit
@@ -39,7 +39,7 @@ jobs:
strategy:
fail-fast: true
matrix:
- python-version: ["3.9", "3.10", "3.11"]
+ python-version: ["3.10", "3.11", "3.12"]
timeout-minutes: 30
steps:
- name: Checkout Code
...
+ from __future__ import annotations
import abc
import hashlib
import json
import logging
import os
- from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, TypeVar, Union
+ from collections.abc import Iterable
+ from typing import TYPE_CHECKING, Any, TypeVar
from tqdm import tqdm
@@ -31,7 +34,7 @@ class LM(abc.ABC):
# set rank and world size to a single process, by default.
self._rank = 0
self._world_size = 1
- self.cache_hook: "CacheHook" = CacheHook(None)
+ self.cache_hook: CacheHook = CacheHook(None)
@abc.abstractmethod
def loglikelihood(self, requests) -> list[tuple[float, bool]]:
@@ -137,7 +140,7 @@ class LM(abc.ABC):
@classmethod
def create_from_arg_string(
- cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
+ cls: type[T], arg_string: str, additional_config: dict | None = None
) -> T:
"""
Creates an instance of the LM class using the given argument string and additional config.
@@ -156,7 +159,7 @@ class LM(abc.ABC):
@classmethod
def create_from_arg_obj(
- cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None
+ cls: type[T], arg_dict: dict, additional_config: dict | None = None
) -> T:
"""
Creates an instance of the LM class using the given arg_obj
@@ -199,7 +202,7 @@ class LM(abc.ABC):
"To use this model with chat templates, please implement the 'tokenizer_name' property."
)
- def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
+ def chat_template(self, chat_template: bool | str = False) -> str | None:
"""Returns the chat template structure for user/assistant messages if a template is provided.
This method is intended to be overridden in a subclass to define a specific chat template format.
For models that do not support chat templates, this method returns None by default.
@@ -207,7 +210,7 @@ class LM(abc.ABC):
return ""
- def set_cache_hook(self, cache_hook: "CacheHook") -> None:
+ def set_cache_hook(self, cache_hook: CacheHook) -> None:
self.cache_hook = cache_hook
@@ -218,9 +221,9 @@ def hash_args(attr: str, args: Iterable[Any]) -> str:
class CacheHook:
- def __init__(self, cachinglm: Optional["CachingLM"]) -> None:
+ def __init__(self, cachinglm: CachingLM | None) -> None:
if cachinglm is None:
- self.dbdict: Optional["SqliteDict"] = None
+ self.dbdict: SqliteDict | None = None
return
self.dbdict = cachinglm.dbdict
@@ -258,7 +261,7 @@ class CachingLM:
eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
return lm_attr
- def _fn(requests: list["Instance"]) -> list["Instance"]:
+ def _fn(requests: list[Instance]) -> list[Instance]:
res = []
remaining_reqs = []
warned = False
@@ -313,7 +316,7 @@ class CachingLM:
return _fn
- def get_cache_hook(self) -> "CacheHook":
+ def get_cache_hook(self) -> CacheHook:
return CacheHook(self)
@@ -324,10 +327,11 @@ class TemplateLM(LM):
"""
tokenizer = None
+ backend = "causal"
@property
@abc.abstractmethod
- def eot_token_id(self):
+ def eot_token_id(self) -> int:
pass
@property
@@ -336,7 +340,9 @@ class TemplateLM(LM):
return self.eot_token_id
@abc.abstractmethod
- def tok_encode(self, string: str, **kwargs) -> list[int]:
+ def tok_encode(
+ self, string: str, add_special_tokens: bool | None = None, **kwargs
+ ) -> list[int]:
"""
Tokenize a string using the model's tokenizer and return a list of token IDs.
"""
@@ -344,45 +350,100 @@ class TemplateLM(LM):
@abc.abstractmethod
def _loglikelihood_tokens(
- self, requests: list["Instance"], **kwargs
+ self, requests: list[Instance], **kwargs
) -> list[tuple[float, bool]]:
pass
def _encode_pair(
self, context: str, continuation: str
) -> tuple[list[int], list[int]]:
- import transformers
+ """
+ Encode a context-continuation pair into separate token ID lists.
+ This method handles the tokenization of context and continuation strings while
+ preserving proper boundary handling. Trailing spaces in the context are moved
+ to the beginning of the continuation to ensure correct tokenization at the
+ word boundary.
+ For Seq2Seq models (encoder-decoder), context and continuation are encoded
+ separately. For other model types (decoder-only), the full sequence is encoded
+ together to ensure proper tokenization, then split at the context boundary.
+ :param context: str
+ The context string. Can be empty (will be handled by the caller).
+ :param continuation: str
+ The continuation string to be scored.
+ :return: tuple[list[int], list[int]]
+ A tuple of (context_enc, continuation_enc) where:
+ - context_enc: Token IDs for the context
+ - continuation_enc: Token IDs for the continuation
+ Note:
+ This method does NOT handle empty context. The caller should
+ handle empty context (see loglikelihood method).
+ """
+ assert context, "Context cannot be empty!"
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
- model_class = getattr(self, "AUTO_MODEL_CLASS", None)
- if model_class == transformers.AutoModelForSeq2SeqLM:
- context_enc = self.tok_encode(context)
- continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
- else:
+ if self.backend == "causal":
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
+ else:
+ # for SEQ2SEQ case we need to encode separately
+ context_enc = self.tok_encode(context)
+ continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
return context_enc, continuation_enc
def loglikelihood(
- self, requests: list["Instance"], disable_tqdm: bool = False
+ self, requests: list[Instance], disable_tqdm: bool = False
) -> list[tuple[float, bool]]:
+ """
+ Compute log-likelihood of generating continuations from contexts.
+ This is the concrete implementation for TemplateLM and its subclasses.
+ It tokenizes context-continuation pairs and delegates scoring to
+ _loglikelihood_tokens.
+ **IMPORTANT**: This method is expected to handle empty context strings.
+ When context is empty (""), it uses the model's prefix_token_id (typically
+ BOS or EOS token) as context. If the continuation already starts with the
+ prefix token, it reuses that token as context instead of duplicating it.
+ :param requests: list[Instance]
+ List of Instance objects with property `args` returning (context, continuation) tuples.
+ :param disable_tqdm: bool
+ Whether to disable the progress bar in _loglikelihood_tokens.
+ :return: list[tuple[float, bool]]
+ List of (log_prob, is_greedy) tuples for each request.
+ Implementation details:
+ - Empty context: Uses prefix_token_id (BOS/EOS) as context
+ - Non-empty context: Uses _encode_pair for proper tokenization
+ - Avoids token duplication when continuation starts with prefix_token_id
+ """
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
- # BOS or EOS as context
+ continuation_enc = self.tok_encode(
+ continuation, add_special_tokens=False
+ )
+ # BOS or EOS as context: handle when context is empty -> (context + continuation) -> (BOS + continuation
context_enc, continuation_enc = (
- [self.prefix_token_id],
- self.tok_encode(continuation),
+ ([self.prefix_token_id], continuation_enc)
+ if self.prefix_token_id != continuation_enc[0]
+ else (continuation_enc[:1], continuation_enc[1:])
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
@@ -400,7 +461,7 @@ class TemplateLM(LM):
def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
pass
- def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
+ def chat_template(self, chat_template: bool | str = False) -> str | None:
"""
Set and get the appropriate chat template for the model.
This method sets the tokenizer's chat_template and returns the template string for reproducibility.
...
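For readers skimming the hunk above: a standalone sketch of the empty-context handling the new `loglikelihood` docstring describes. The helper name and token IDs below are made up for illustration; only the branching mirrors the diff.

```python
# Illustrative only: mirrors the empty-context branch added to TemplateLM.loglikelihood.
# When context == "", the prefix token (BOS/EOS) is used as context, and it is reused
# rather than duplicated if the tokenized continuation already starts with it.
def split_empty_context(prefix_token_id: int, continuation_enc: list[int]) -> tuple[list[int], list[int]]:
    if prefix_token_id != continuation_enc[0]:
        return [prefix_token_id], continuation_enc
    return continuation_enc[:1], continuation_enc[1:]

assert split_empty_context(1, [5, 6, 7]) == ([1], [5, 6, 7])  # prefix token prepended as context
assert split_empty_context(1, [1, 5, 6]) == ([1], [5, 6])     # prefix token reused, not duplicated
```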
@@ -114,7 +114,7 @@ class TemplateAPI(TemplateLM):
# however the requests can be sent as a string if the API doesn't support token inputs.
# use tokenized_requests=False
tokenizer_backend: Optional[
- Literal["tiktoken", "huggingface", "None", "none"]
+ Literal["tiktoken", "huggingface", "remote", "None", "none"]
] = "huggingface",
truncate: bool = False,
# number of concurrent requests. More useful if not batching
@@ -132,6 +132,8 @@ class TemplateAPI(TemplateLM):
revision: Optional[str] = "main",
use_fast_tokenizer: bool = True,
verify_certificate: bool = True,
+ ca_cert_path: Optional[str] = None,
+ auth_token: Optional[str] = None,
eos_string: str = None,
# timeout in seconds
timeout: int = 300,
@@ -182,6 +184,8 @@ class TemplateAPI(TemplateLM):
self.tokenized_requests = tokenized_requests
self.max_retries = int(max_retries)
self.verify_certificate = verify_certificate
+ self.ca_cert_path = ca_cert_path
+ self.auth_token = auth_token
self._eos_string = eos_string
self.timeout = int(timeout)
self.max_images = int(max_images)
@@ -218,6 +222,21 @@ class TemplateAPI(TemplateLM):
f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. "
"Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
)
+ elif self.tokenizer_backend == "remote":
+ from lm_eval.utils import RemoteTokenizer
+ if not self.base_url:
+ raise ValueError(
+ "base_url is required for remote tokenizer backend"
+ )
+ self.tokenizer = RemoteTokenizer(
+ self.base_url,
+ self.timeout,
+ self.verify_certificate,
+ self.ca_cert_path,
+ self.auth_token,
+ )
+ eval_logger.info(f"Using remote tokenizer from {self.base_url}")
else:
import transformers
@@ -310,7 +329,7 @@ class TemplateAPI(TemplateLM):
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
- ) -> Union[str, JsonChatStr]:
+ ) -> Union[str, JsonChatStr, List[Dict]]:
"""Applies a chat template to a list of chat history between user and model."""
if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
return self.tokenizer.apply_chat_template(
@@ -319,6 +338,8 @@ class TemplateAPI(TemplateLM):
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
)
+ elif self.tokenizer_backend == "remote" and self.tokenized_requests:
+ return chat_history
else:
# bit of a hack. We'll load back before sending to the API
return JsonChatStr(
@@ -337,6 +358,8 @@ class TemplateAPI(TemplateLM):
return self.tokenizer.eos_token_id
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.eot_token
+ elif self.tokenizer_backend == "remote":
+ return self.tokenizer.eos_token_id
@cached_property
def eos_string(self) -> Optional[str]:
@@ -347,6 +370,8 @@ class TemplateAPI(TemplateLM):
return self.tokenizer.eos_token
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.decode([self.tokenizer.eot_token])
+ elif self.tokenizer_backend == "remote":
+ return self.tokenizer.eos_token
else:
eval_logger.warning(
"Cannot determine EOS string to pass to stop sequence. Manually set by passing `eos_string` to model_args."
@@ -364,6 +389,8 @@ class TemplateAPI(TemplateLM):
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
+ elif self.tokenizer_backend == "remote":
+ return self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
else:
return self.tokenizer.eot_token
@@ -396,7 +423,19 @@ class TemplateAPI(TemplateLM):
encoding = encoding[-left_truncate_len:]
return encoding
+ elif self.tokenizer_backend == "remote":
+ if isinstance(string, str):
+ encoding = self.tokenizer.encode(string)
+ else:
+ encoding = [self.tokenizer.encode(s) for s in string]
+ if left_truncate_len:
+ if isinstance(string, str):
+ encoding = encoding[-left_truncate_len:]
+ else:
+ encoding = [enc[-left_truncate_len:] for enc in encoding]
+ return encoding
else:
try:
encoding = self.tokenizer.encode(string)
@@ -409,6 +448,8 @@ class TemplateAPI(TemplateLM):
return self.tokenizer.batch_decode(tokens)
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.decode_batch(tokens)
+ elif self.tokenizer_backend == "remote":
+ return self.tokenizer.batch_decode(tokens)
def model_call(
self,
...
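The remote-tokenizer branches above only rely on a small duck-typed surface. A stub of that assumed interface (method and attribute names are taken from the calls in this diff; the actual `RemoteTokenizer` in `lm_eval.utils` and its HTTP layer are not shown here):

```python
# Assumed interface, inferred from the attribute accesses in the diff above
# (encode, batch_decode, eos_token, eos_token_id, bos_token_id). Values are placeholders.
class RemoteTokenizerStub:
    eos_token: str = "</s>"   # placeholder
    eos_token_id: int = 2     # placeholder
    bos_token_id: int = 1     # placeholder

    def encode(self, text: str) -> list[int]:
        raise NotImplementedError("would call the server's tokenize endpoint")

    def batch_decode(self, token_lists: list[list[int]]) -> list[str]:
        raise NotImplementedError("would call the server's detokenize endpoint")
```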
@@ -32,6 +32,7 @@ from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
+ bos_already_added,
clear_torch_cache,
configure_pad_token,
get_dtype,
@@ -84,7 +85,7 @@ class HFLM(TemplateLM):
max_batch_size: int | None = 64,
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
- add_bos_token: bool | None = False,
+ add_bos_token: bool | None = None,
prefix_token_id: int | None = None,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
@@ -258,11 +259,6 @@ class HFLM(TemplateLM):
)
self.add_bos_token = add_bos_token
- if "gemma" in getattr(self.config, "model_type", ""):
- self.add_bos_token = True
- eval_logger.info(
- f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
- )
self._max_length = max_length
self.pretrained = pretrained
@@ -744,7 +740,7 @@ class HFLM(TemplateLM):
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
gguf_file: str | None = None,
- add_bos_token: bool | None = False,
+ add_bos_token: bool | None = None,
subfolder: str | None = "",
) -> None:
"""Helper method during initialization.
@@ -763,8 +759,8 @@ class HFLM(TemplateLM):
else:
kwargs["use_fast"] = use_fast_tokenizer
- if add_bos_token:
- kwargs["add_bos_token"] = True
+ if add_bos_token is not None:
+ kwargs["add_bos_token"] = add_bos_token
if subfolder:
kwargs["subfolder"] = subfolder
@@ -858,23 +854,20 @@ class HFLM(TemplateLM):
def tok_encode(
self,
string: str,
- left_truncate_len: int | None = None,
add_special_tokens: bool | None = None,
+ left_truncate_len: int | None = None,
+ **kwargs,
) -> list[int]:
"""
# default for None - empty dict, use predefined tokenizer param
# used for all models except for CausalLM or predefined value
- special_tokens_kwargs = {}
- # by default for CausalLM - false or self.add_bos_token is set
- if add_special_tokens is None:
- if self.backend == "causal":
- special_tokens_kwargs = {
- "add_special_tokens": False or self.add_bos_token
- }
- # otherwise the method explicitly defines the value
- else:
- special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
+ special_tokens_kwargs = (
+ {"add_special_tokens": add_special_tokens}
+ if (isinstance(add_special_tokens, bool))
+ else {"add_special_tokens": self.add_bos_token}
+ if self.add_bos_token is not None
+ else {}
+ )
encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
@@ -897,7 +890,14 @@ class HFLM(TemplateLM):
add_special_tokens = {}
if self.backend == "causal":
- add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
+ if bos_already_added(
+ strings[0], getattr(self.tokenizer, "bos_token", None)
+ ):
+ add_special_tokens = {"add_special_tokens": False}
+ elif self.add_bos_token is not None:
+ add_special_tokens = {"add_special_tokens": self.add_bos_token}
+ else:
+ add_special_tokens = {}
encoding = self.tokenizer(
strings,
@@ -971,7 +971,7 @@ class HFLM(TemplateLM):
context,
max_length: int,
stop: list[str],
- **generation_kwargs: dict[str, Any],
+ **generation_kwargs,
) -> torch.Tensor:
# temperature = 0.0 if not set
# if do_sample is false and temp==0.0:
...
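The rewritten `tok_encode` above compresses the special-tokens logic into one expression; here is a tiny standalone sketch of the same precedence (explicit argument, then configured `add_bos_token`, then tokenizer default), using a hypothetical helper name:

```python
# Mirrors the precedence in the new HFLM.tok_encode kwargs construction.
def special_tokens_kwargs(add_special_tokens: bool | None, add_bos_token: bool | None) -> dict:
    if isinstance(add_special_tokens, bool):      # caller was explicit
        return {"add_special_tokens": add_special_tokens}
    if add_bos_token is not None:                 # fall back to the model-level setting
        return {"add_special_tokens": add_bos_token}
    return {}                                     # otherwise defer to the tokenizer default

assert special_tokens_kwargs(False, True) == {"add_special_tokens": False}
assert special_tokens_kwargs(None, True) == {"add_special_tokens": True}
assert special_tokens_kwargs(None, None) == {}
```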
@@ -16,12 +16,46 @@ eval_logger = logging.getLogger(__name__)
class LocalCompletionsAPI(TemplateAPI):
def __init__(
self,
- base_url: str = None,
- tokenizer_backend: str = "huggingface",
+ base_url=None,
+ tokenizer_backend="auto",
+ verify_certificate=True,
+ ca_cert_path=None,
+ auth_token=None,
**kwargs,
):
+ # Auto-detect tokenizer backend
+ if tokenizer_backend == "auto":
+ if base_url:
+ from lm_eval.utils import check_remote_tokenizer_support
+ if check_remote_tokenizer_support(
+ base_url,
+ verify_certificate=verify_certificate,
+ ca_cert_path=ca_cert_path,
+ auth_token=auth_token,
+ ):
+ eval_logger.info(
+ "Auto-detected remote tokenizer support. Using remote tokenizer backend."
+ )
+ tokenizer_backend = "remote"
+ else:
+ eval_logger.info(
+ "Remote tokenizer not supported. Using huggingface tokenizer backend."
+ )
+ tokenizer_backend = "huggingface"
+ else:
+ eval_logger.warning(
+ "No base_url provided. Using huggingface tokenizer backend."
+ )
+ tokenizer_backend = "huggingface"
super().__init__(
- base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
+ base_url=base_url,
+ tokenizer_backend=tokenizer_backend,
+ verify_certificate=verify_certificate,
+ ca_cert_path=ca_cert_path,
+ auth_token=auth_token,
+ **kwargs,
)
def _create_payload(
@@ -98,20 +132,28 @@ class LocalCompletionsAPI(TemplateAPI):
@register_model("local-chat-completions")
class LocalChatCompletion(LocalCompletionsAPI):
+ """
+ Minimal chat-completions wrapper.
+ - Only accepts messages as list[dict].
+ - No tokenization or template logic.
+ - Use with --apply_chat_template or ensure upstream formats messages correctly.
+ """
def __init__(
self,
- base_url: str = None,
- tokenizer_backend: str = None,
- tokenized_requests: bool = False,
+ base_url=None,
+ verify_certificate=True,
+ ca_cert_path=None,
+ auth_token=None,
**kwargs,
):
+ eval_logger.warning(
+ "chat-completions endpoint requires the `--apply_chat_template` flag."
+ )
super().__init__(
base_url=base_url,
- tokenizer_backend=tokenizer_backend,
- tokenized_requests=tokenized_requests,
+ tokenizer_backend=None,
+ tokenized_requests=None,
+ verify_certificate=verify_certificate,
+ ca_cert_path=ca_cert_path,
+ auth_token=auth_token,
**kwargs,
)
if self._batch_size > 1:
@@ -129,9 +171,13 @@ class LocalChatCompletion(LocalCompletionsAPI):
eos=None,
**kwargs,
) -> dict:
- assert type(messages) is not str, (
- "chat-completions require the --apply_chat_template flag."
+ assert isinstance(messages, list) and all(
+ isinstance(m, dict) for m in messages
+ ), (
+ "LocalChatCompletion expects messages as list[dict]. "
+ "If you see this error, ensure --apply_chat_template is set or upstream code formats messages correctly."
)
+ gen_kwargs = gen_kwargs or {}
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
...
@@ -150,7 +150,7 @@ class Grouper:
def pad_and_concat(
max_length: int,
- tensors: List[torch.Tensor],
+ tensors: list[torch.Tensor],
padding_side: Literal["right", "left"] = "right",
):
"""
@@ -881,3 +881,7 @@ def postprocess_generated_text(
generation = generation.split(think_end_token)[-1].lstrip()
return generation
+ def bos_already_added(sequence: str, bos_string: Optional[str]):
+ return sequence.startswith(bos_string) if bos_string is not None else False
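The new `bos_already_added` helper is small enough to exercise directly; a quick usage sketch with made-up strings:

```python
# Behavior of the helper added above: True only when a BOS string is known
# and the text already starts with it.
def bos_already_added(sequence: str, bos_string: str | None) -> bool:
    return sequence.startswith(bos_string) if bos_string is not None else False

assert bos_already_added("<s>Hello", "<s>") is True   # BOS already present in the string
assert bos_already_added("Hello", "<s>") is False     # tokenizer should still add BOS
assert bos_already_added("Hello", None) is False      # model has no BOS string at all
```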
+ from __future__ import annotations
import copy
import gc
import logging
@@ -7,7 +9,7 @@ from importlib.util import find_spec
from multiprocessing import Process, Queue
from queue import Empty
from time import sleep
- from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
+ from typing import TYPE_CHECKING, Literal
import jinja2
from more_itertools import distribute
@@ -19,6 +21,7 @@ from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
+ bos_already_added,
configure_pad_token,
handle_stop_sequences,
postprocess_generated_text,
@@ -50,10 +53,10 @@ eval_logger = logging.getLogger(__name__)
def _vllm_mp_worker(
model_args: dict,
- sampling_params: list["SamplingParams"],
+ sampling_params: list[SamplingParams],
requests: list[list[int]],
- lora_request: "LoRARequest",
- result_queue: "Queue",
+ lora_request: LoRARequest,
+ result_queue: Queue,
dp_size: int,
local_dp_rank: int,
dp_master_port: int,
@@ -113,18 +116,18 @@ class VLLM(TemplateLM):
self,
pretrained: str,
dtype: Literal["float16", "bfloat16", "float32", "auto"] = "auto",
- revision: Optional[str] = None,
- trust_remote_code: Optional[bool] = False,
- tokenizer: Optional[str] = None,
+ revision: str | None = None,
+ trust_remote_code: bool | None = False,
+ tokenizer: str | None = None,
tokenizer_mode: Literal["auto", "slow"] = "auto",
- tokenizer_revision: Optional[str] = None,
- add_bos_token: Optional[bool] = False,
- prefix_token_id: Optional[int] = None,
+ tokenizer_revision: str | None = None,
+ add_bos_token: bool | None = False,
+ prefix_token_id: int | None = None,
tensor_parallel_size: int = 1,
- quantization: Optional[str] = None,
+ quantization: str | None = None,
max_gen_toks: int = 256,
swap_space: int = 4,
- batch_size: Union[str, int] = 1,
+ batch_size: str | int = 1,
max_batch_size=None,
max_length: int = None,
max_model_len: int = None,
@@ -134,9 +137,9 @@ class VLLM(TemplateLM):
lora_local_path: str = None,
# VLLM: enable thinking tags in the prompt.
enable_thinking: bool = True,
- chat_template_args: Optional[dict] = None,
+ chat_template_args: dict | None = None,
# End marker for thinking tags - splits to get response after this token (if provided).
- think_end_token: Optional[str] = None,
+ think_end_token: str | None = None,
max_lora_rank: int = 16,
**kwargs,
):
@@ -195,11 +198,7 @@ class VLLM(TemplateLM):
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
- if "gemma" in pretrained.lower():
- add_bos_token = True
- eval_logger.info(
- "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
- )
+ self.add_bos_token = add_bos_token
from transformers import AutoConfig
@@ -211,14 +210,17 @@ class VLLM(TemplateLM):
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
revision=tokenizer_revision,
- add_bos_token=add_bos_token,
+ **(
+ {"add_bos_token": self.add_bos_token}
+ if self.add_bos_token is not None
+ else {}
+ ),
)
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self._config)
self.chat_template_args = chat_template_args or {}
self.enable_thinking = self.chat_template_args.pop(
"enable_thinking", enable_thinking
)
- self.add_bos_token = add_bos_token
if parse_version(version("vllm")) >= parse_version("0.8.3"):
kwargs_resolve_hf_chat_template = {
@@ -265,7 +267,7 @@
self.lora_request = None
@property
- def eot_token_id(self):
+ def eot_token_id(self) -> int | None:
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@@ -300,7 +302,7 @@
return self._max_gen_toks
def apply_chat_template(
- self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
+ self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
@@ -337,18 +339,27 @@
def tok_encode(
self,
- string: Union[str, List[str]],
- left_truncate_len: int = None,
- add_special_tokens: bool = False,
+ string: str | list[str],
+ left_truncate_len: int | None = None,
+ add_special_tokens: bool | None = None,
truncation: bool = False,
- ) -> Union[List[int], List[List[int]]]:
- if not add_special_tokens:
- add_special_tokens = False or self.add_bos_token
- encoding: Union[List[List[int]], List[int]] = self.tokenizer(
+ ) -> list[int] | list[list[int]]:
+ add_special_kwargs = (
+ {"add_special_tokens": add_special_tokens or self.add_bos_token}
+ if (add_special_tokens is not None or self.add_bos_token is not None)
+ else {}
+ )
+ # handle chat template
+ if bos_already_added(
+ string[0] if isinstance(string, list) else string, self.tokenizer.bos_token
+ ):
+ add_special_kwargs = {"add_special_tokens": False}
+ encoding: list[list[int]] | list[int] = self.tokenizer(
string,
- add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
+ **add_special_kwargs,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
@@ -362,15 +373,15 @@
def _model_generate(
self,
- requests: List[List[int]] = None,
+ requests: list[list[int]],
generate: bool = False,
- sampling_params: Union[List["SamplingParams"], "SamplingParams", None] = None,
+ sampling_params: list[SamplingParams] | SamplingParams | None = None,
):
if not generate or sampling_params is None:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
- if not isinstance(sampling_params, List):
+ if not isinstance(sampling_params, list):
sampling_params = [sampling_params] * len(requests)
if self.data_parallel_size > 1 and not self.V1:
# vLLM hangs if resources are set in ray.remote
@@ -379,9 +390,9 @@
@ray.remote
def run_inference_one_model(
model_args: dict,
- sampling_params: List["SamplingParams"],
- requests: List[List[int]],
- lora_request: "LoRARequest",
+ sampling_params: list[SamplingParams],
+ requests: list[list[int]],
+ lora_request: LoRARequest,
):
llm = LLM(**model_args)
return llm.generate(
@@ -487,8 +498,8 @@
return outputs
def loglikelihood_rolling(
- self, requests: List[Instance], disable_tqdm: bool = False
- ) -> List[float]:
+ self, requests: list[Instance], disable_tqdm: bool = False
+ ) -> list[float]:
adaptive_batch_size = None
if self.batch_size == "auto":
adaptive_batch_size = len(requests)
@@ -503,7 +514,7 @@
disable=(disable_tqdm or (self.rank != 0)),
)
):
- rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
+ rolling_token_windows: list[tuple[list[int], list[int]]] = list(
map(
make_disjoint_window,
get_rolling_token_windows(
@@ -556,16 +567,14 @@
return loglikelihoods
def generate_until(
- self, requests: List[Instance], disable_tqdm: bool = False
- ) -> List[str]:
+ self, requests: list[Instance], disable_tqdm: bool = False
+ ) -> list[str]:
res = []
# batch tokenize contexts
context, all_gen_kwargs = zip(*(req.args for req in requests))
- context_encoding: List[List[int]] = self.tok_encode(
- context, add_special_tokens=self.add_bos_token
- )
- requests = [
+ context_encoding = self.tok_encode(context)
+ reqs = [
((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
]
@@ -579,7 +588,7 @@
return -len(_requests[0][1]), _requests[0][0]
re_ords = Collator(
- requests,
+ reqs,
_collate_gen,
group_by=None,
)
@@ -588,7 +597,7 @@
)
pbar = tqdm(
- total=len(requests),
+ total=len(reqs),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests",
)
@@ -656,9 +665,9 @@
def _loglikelihood_tokens(
self,
- requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+ requests: list[tuple[tuple[str, str], list[int], list[int]]],
disable_tqdm: bool = False,
- ) -> List[Tuple[float, bool]]:
+ ) -> list[tuple[float, bool]]:
res = []
def _collate(x):
@@ -717,7 +726,7 @@
return re_ord.get_original(res)
@staticmethod
- def _parse_logprobs(tokens: List, outputs, ctxlen: int) -> Tuple[float, bool]:
+ def _parse_logprobs(tokens: list, outputs, ctxlen: int) -> tuple[float, bool]:
"""Process logprobs and tokens.
:param tokens: list
...
# Titulm Bangla MMLU
This repository contains resources related to **Titulm Bangla MMLU**, a benchmark dataset designed for evaluating Bangla language models. The dataset is used for training, development, and comparative evaluation of language models in the Bangla language.
---
## Overview
**TituLLMs** is a family of Bangla large language models (LLMs) with comprehensive benchmarking designed to advance natural language processing for the Bangla language. The benchmark dataset `Titulm Bangla MMLU` covers multiple-choice questions across a diverse range of topics in Bangla.
This dataset is primarily used to train, validate, and evaluate Bangla language models and compare their performance with other existing models.
For more details, please refer to the original research paper:
[https://arxiv.org/abs/2502.11187](https://arxiv.org/abs/2502.11187)
---
## Dataset
The `Titulm Bangla MMLU` dataset can be found on Hugging Face:
[https://huggingface.co/datasets/hishab/titulm-bangla-mmlu](https://huggingface.co/datasets/hishab/titulm-bangla-mmlu)
This dataset was used as a benchmark in the development and evaluation of TituLLMs and related models.
---
## Usage
The dataset is intended for use within the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) repository to evaluate and compare the performance of Bangla language models.
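For example, a minimal sketch of running the task through the harness's Python API; the model below (`gpt2`) is only a placeholder, not a recommended Bangla model:

```python
# Minimal sketch: evaluate a Hugging Face model on bangla_mmlu with lm-evaluation-harness.
# Swap the placeholder pretrained model for the Bangla model you actually want to test.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder checkpoint
    tasks=["bangla_mmlu"],
    num_fewshot=5,
)
print(results["results"]["bangla_mmlu"])
```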
---
## Notes

- The dataset can also be used to evaluate other models.
- Other Bangla datasets (e.g., BoolQ, OpenBookQA) will be added soon.
## Citation
If you use this dataset or model, please cite the original paper:
```bibtex
@misc{nahin2025titullmsfamilybanglallms,
title={TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking},
author={Shahriar Kabir Nahin and Rabindra Nath Nandi and Sagor Sarker and Quazi Sarwar Muhtaseem and Md Kowsher and Apu Chandraw Shill and Md Ibrahim and Mehadi Hasan Menon and Tareq Al Muntasir and Firoj Alam},
year={2025},
eprint={2502.11187},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.11187},
}
```
task: bangla_mmlu
dataset_path: hishab/titulm-bangla-mmlu
dataset_name: all
description: "The following are multiple choice questions (with answers) about range of topics in Bangla"
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}} A. {{options[0]}} B. {{options[1]}} C. {{options[2]}} D. {{options[3]}} Answer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
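To sanity-check the fields this config maps into the prompt (`question`, `options`, `answer`), here is a small sketch assuming the Hugging Face `datasets` library; it mirrors the `doc_to_text` template above:

```python
# Load the dataset named in the config (dataset_path/dataset_name/test_split)
# and rebuild the prompt that the doc_to_text template produces for the first document.
from datasets import load_dataset

ds = load_dataset("hishab/titulm-bangla-mmlu", "all", split="test")
doc = ds[0]
prompt = (
    f"{doc['question'].strip()} "
    f"A. {doc['options'][0]} B. {doc['options'][1]} "
    f"C. {doc['options'][2]} D. {doc['options'][3]} Answer:"
)
print(prompt, "->", doc["answer"])
```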
# LongBench v2
### Paper
Title: `LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks`
Abstract: `This paper introduces LongBench v2, a benchmark designed to assess the ability of LLMs to handle long-context problems requiring deep understanding and reasoning across real-world multitasks. LongBench v2 consists of 503 challenging multiple-choice questions, with contexts ranging from 8k to 2M words, across six major task categories: single-document QA, multi-document QA, long in-context learning, long-dialogue history understanding, code repository understanding, and long structured data understanding. To ensure the breadth and the practicality, we collect data from nearly 100 highly educated individuals with diverse professional backgrounds. We employ both automated and manual review processes to maintain high quality and difficulty, resulting in human experts achieving only 53.7% accuracy under a 15-minute time constraint. Our evaluation reveals that the best-performing model, when directly answers the questions, achieves only 50.1% accuracy. In contrast, the o1-preview model, which includes longer reasoning, achieves 57.7%, surpassing the human baseline by 4%. These results highlight the importance of enhanced reasoning ability and scaling inference-time compute to tackle the long-context challenges in LongBench v2.`
Homepage: `https://github.com/THUDM/LongBench`
### Citation
```
@article{bai2024longbench2,
title={LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks},
author={Yushi Bai and Shangqing Tu and Jiajie Zhang and Hao Peng and Xiaozhi Wang and Xin Lv and Shulin Cao and Jiazheng Xu and Lei Hou and Yuxiao Dong and Jie Tang and Juanzi Li},
journal={arXiv preprint arXiv:2412.15204},
year={2024}
}
```
### Groups, Tags, and Tasks
#### Groups
* `longbench2_single`: Single-document QA tasks requiring comprehension of documents across various domains (government, legal, literature, finance, academic, detective stories, and order of events)
* `longbench2_multi`: Multi-document QA tasks requiring information synthesis and reasoning across multiple documents in government, academic, finance, and news
* `longbench2_incontext`: Long in-context learning tasks including user guide comprehension, translation with examples, and many-shot learning scenarios
* `longbench2_history`: Long-dialogue history understanding tasks involving agent conversations and dialogue history comprehension
* `longbench2_structured`: Long structured data understanding tasks for graph and table data processing
#### Tags
* `longbench2`: Run the full benchmark with 503 multiple-choice questions (8k-2M words) testing understanding and reasoning on long-context tasks
#### Tasks
**Single-Document QA:**
* `longbench2_govt_single`: Question answering from single government documents
* `longbench2_legal_single`: Question answering from single legal documents
* `longbench2_lit_single`: Question answering from single literature/literary documents
* `longbench2_fin_single`: Question answering from single financial documents
* `longbench2_academic_single`: Question answering from single academic papers and research documents
* `longbench2_detective`: Question answering from detective stories requiring logical reasoning
* `longbench2_event_order`: Temporal reasoning tasks about event ordering in narratives
**Multi-Document QA:**
* `longbench2_govt_multi`: Question answering across multiple government documents
* `longbench2_academic_multi`: Question answering across multiple academic papers
* `longbench2_fin_multi`: Question answering across multiple financial documents
* `longbench2_news_multi`: Question answering across multiple news articles
**Long In-context Learning:**
* `longbench2_user_guide`: Comprehension and application of user guide instructions
* `longbench2_translate`: Translation tasks in new languages with long examples
* `longbench2_many_shot`: Few-shot learning with many examples in context
**Long-dialogue History Understanding:**
* `longbench2_agent_history`: Understanding and reasoning over extended agent conversation histories
* `longbench2_dialogue_history`: Understanding and reasoning over long dialogue exchanges
**Code Repository Understanding:**
* `longbench2_code`: Question answering on code repositories requiring codebase comprehension
**Long Structured Data Understanding:**
* `longbench2_graph`: Understanding and reasoning over graph-structured data
* `longbench2_table`: Understanding and reasoning over tabular data
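The groups and tasks listed above can be passed to the harness by name; a short sketch follows (the placeholder model is for illustration only, since these tasks need a genuinely long-context model):

```python
# Sketch: run a couple of the LongBench v2 groups/tasks listed above, or the whole
# `longbench2` tag, by passing their names to simple_evaluate.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder; use a long-context model in practice
    tasks=["longbench2_code", "longbench2_single"],
)
print(results["results"])
```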
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: longbench2
task:
- longbench2_history_tasks
- longbench2_incontext_tasks
- longbench2_multi_tasks
- longbench2_single_tasks
- longbench2_structured_tasks
- longbench2_code
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_history
group_alias: "Long-dialogue History Understanding"
task:
- longbench2_agent_history
- longbench2_dialogue_history
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_incontext
group_alias: "Long In-context Learning"
task:
- longbench2_user_guide
- longbench2_translate
- longbench2_many_shot
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_multi
group_alias: "Multi-Document QA"
task:
- longbench2_govt_multi
- longbench2_academic_multi
- longbench2_fin_multi
- longbench2_news_multi
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_single
group_alias: "Single-Document QA"
task:
- longbench2_govt_single
- longbench2_legal_single
- longbench2_lit_single
- longbench2_fin_single
- longbench2_event_order
- longbench2_academic_single
- longbench2_detective
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
group: longbench2_structured
group_alias: "Long Structured Data Understanding"
task:
- longbench2_graph
- longbench2_table
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
dataset_path: recursal/longbench-v2
test_split: train
output_type: multiple_choice
doc_to_text: "Please read the following text and answer the question below.\n\n<text>\n{{context}}\n</text>\n\nWhat is the correct answer to this question: {{question.strip()}}\nChoices:\n(A) {{choices[0]}}\n(B) {{choices[1]}}\n(C) {{choices[2]}}\n(D) {{choices[3]}}\n\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
include: _longbench_common_yaml
tag:
- longbench2_tasks
- longbench2_multi_tasks
task: longbench2_academic_multi
dataset_name: academic_multi