Merge branch 'main' into comma

3e8135ce · Baber · 8e560c96 · 0c134ee9 · 3e8135ce · 3e8135ce
Commit 3e8135ce authored Sep 16, 2025 by Baber
20 changed files
--- a/lm_eval/__init__.py
+++ b/lm_eval/__init__.py
@@ -2,7 +2,7 @@ import logging
 import os


-__version__ = "0.4.9"
+__version__ = "0.4.9.1"


 # Lazy-load .evaluator module to improve CLI startup

--- a/lm_eval/decontamination/janitor.py
+++ b/lm_eval/decontamination/janitor.py
@@ -5,8 +5,9 @@ import traceback
 from typing import Iterator, List, Sequence, Tuple, TypeVar


-# This is a cpp module. Compile janitor_util.cpp with:
-# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
+# This is a cpp module.
+# See scripts/clean_training_data/README.md for instructions to compile janitor_util.cpp
+
 try:
    import janitor_util


--- a/lm_eval/models/hf_steered.py
+++ b/lm_eval/models/hf_steered.py
@@ -71,13 +71,6 @@ class SteeredModel(HFLM):
        """
        HFLM with a steered forward pass.

-        To derive steering vectors from a sparse model loadable with sparsify or sae_lens,
-        provide the path to a CSV file with the following columns (example rows are provided below):
-
-        loader,action,sparse_model,hookpoint,feature_index,steering_coefficient,sae_id,description,
-        sparsify,add,EleutherAI/sae-pythia-70m-32k,layers.3,30,10.0,,,
-        sae_lens,add,gemma-scope-2b-pt-res-canonical,layers.20,12082,240.0,layer_20/width_16k/canonical,increase dogs,
-
        To load steering vectors directly, provide the path to a pytorch (.pt) file with content in the following format:

        {
@@ -86,9 +79,17 @@ class SteeredModel(HFLM):
                "steering_coefficient": <float>,
                "action": <Literal["add", "clamp"]>,
                "bias": <torch.Tensor | None>,
+                "head_index": <int | None>,
            },
            ...
        }
+
+        To derive steering vectors from a sparse model loadable with sparsify or sae_lens,
+        provide the path to a CSV file with the following columns (example rows are provided below):
+
+        loader,action,sparse_model,hookpoint,feature_index,steering_coefficient,head_index,sae_id,description,
+        sparsify,add,EleutherAI/sae-pythia-70m-32k,layers.3,30,10.0,,,,
+        sae_lens,add,gemma-scope-2b-pt-res-canonical,layers.20,12082,240.0,,layer_20/width_16k/canonical,increase dogs,
        """
        super().__init__(pretrained=pretrained, device=device, **kwargs)

@@ -105,27 +106,31 @@ class SteeredModel(HFLM):
        hook_to_steer = {}
        for hookpoint, steer_info in steer_config.items():
            action = steer_info["action"]
-            steering_coefficient = steer_info["steering_coefficient"]
            steering_vector = (
                steer_info["steering_vector"].to(self.device).to(self.model.dtype)
            )
-            bias = (
-                steer_info["bias"].to(self.device).to(self.model.dtype)
-                if steer_info["bias"] is not None
-                else None
-            )
+            steering_coefficient = float(steer_info.get("steering_coefficient", 1.0))
+            head_index = steer_info.get("head_index", None)
+            bias = steer_info.get("bias", None)
+            if bias is not None:
+                bias = bias.to(self.device).to(self.model.dtype)

            if action == "add":
-                # Steers the model by adding some multiple of a steering vector to all sequence positions.
-                hook_to_steer[hookpoint] = (
-                    lambda acts: acts + steering_coefficient * steering_vector
+                # Steer the model by adding a multiple of a steering vector to all sequence positions.
+                assert bias is None, "Bias is not supported for the `add` action."
+                hook_to_steer[hookpoint] = partial(
+                    self.add,
+                    vector=steering_vector * steering_coefficient,
+                    head_index=head_index,
                )
            elif action == "clamp":
+                # Steer the model by clamping the activations to a value in the direction of the steering vector.
                hook_to_steer[hookpoint] = partial(
                    self.clamp,
-                    steering_vector=steering_vector,
+                    direction=steering_vector / torch.norm(steering_vector),
                    value=steering_coefficient,
                    bias=bias,
+                    head_index=head_index,
                )
            else:
                raise ValueError(f"Unknown hook type: {action}")
@@ -195,34 +200,62 @@ class SteeredModel(HFLM):

        return steer_data

+    @classmethod
+    def add(
+        cls,
+        acts: Tensor,
+        vector: Tensor,
+        head_index: Optional[int],
+    ):
+        """Adds the given vector to the activations.
+
+        Args:
+            acts (Tensor): The activations tensor to edit of shape [batch, pos, ..., features]
+            vector (Tensor): A vector to add of shape [features]
+            head_index (int | None): Optional attention head index to add to
+        """
+        if head_index is not None:
+            acts[:, :, head_index, :] = acts[:, :, head_index, :] + vector
+        else:
+            acts = acts + vector
+
+        return acts
+
    @classmethod
    def clamp(
        cls,
        acts: Tensor,
-        steering_vector: Tensor,
+        direction: Tensor,
        value: float,
+        head_index: Optional[int],
        bias: Optional[Tensor] = None,
    ):
-        """Clamps a direction of the activations to be the steering vector * the value.
+        """Clamps the activations to a given value in a specified direction. The direction
+        must be a unit vector.

        Args:
-            acts (Tensor): The activations tensor to edit of shape [batch, pos, features]
-            steering_vector (Tensor): A direction to clamp of shape [features]
+            acts (Tensor): The activations tensor to edit of shape [batch, pos, ..., features]
+            direction (Tensor): A direction to clamp of shape [features]
            value (float): Value to clamp the direction to
+            head_index (int | None): Optional attention head index to clamp
            bias (Tensor | None): Optional bias to add to the activations

        Returns:
            Tensor: The modified activations with the specified direction clamped
        """
-
        if bias is not None:
            acts = acts - bias

-        direction = steering_vector / torch.norm(steering_vector)
-        proj_magnitude = torch.sum(acts * direction, dim=-1, keepdim=True)
-        orthogonal_component = acts - proj_magnitude * direction
+        if head_index is not None:
+            x = acts[:, :, head_index, :]
+            proj = (x * direction).sum(dim=-1, keepdim=True)
+            assert proj == acts @ direction

-        clamped = orthogonal_component + direction * value
+            clamped = acts.clone()
+            clamped[:, :, head_index, :] = x + direction * (value - proj)
+        else:
+            proj = torch.sum(acts * direction, dim=-1, keepdim=True)
+            clamped = acts + direction * (value - proj)

        if bias is not None:
            return clamped + bias

--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -680,10 +680,19 @@ class HFLM(TemplateLM):
                "0.4.0"
            ):
                raise AssertionError("load_in_4bit requires peft >= 0.4.0")
-            if self._model.config.vocab_size != len(self.tokenizer):
+
+            # Compatible with Gemma3 (multimodal) and old models
+            if hasattr(self._model.config, "text_config") and hasattr(
+                self._model.config.text_config, "vocab_size"
+            ):
+                vocab_size = self._model.config.text_config.vocab_size
+            else:
+                vocab_size = self._model.config.vocab_size
+
+            if vocab_size != len(self.tokenizer):
                # resize model for LoRAs with added tokens
                eval_logger.info(
-                    f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
+                    f"Model config indicates vocab_size='{vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
                )
                self._model.resize_token_embeddings(len(self.tokenizer))
            self._model = PeftModel.from_pretrained(

--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
@@ -289,7 +289,7 @@ class OpenAIChatCompletion(LocalChatCompletion):
            "seed": seed,
            **gen_kwargs,
        }
-        if "o1" in self.model:
+        if "o1" in self.model or "5" in self.model:
            output.pop("stop")
            output["temperature"] = 1
        elif "o3" in self.model:

--- a/lm_eval/models/optimum_lm.py
+++ b/lm_eval/models/optimum_lm.py
@@ -28,9 +28,8 @@ class OptimumLM(HFLM):
        **kwargs,
    ) -> None:
        if "backend" in kwargs:
-            # optimum currently only supports causal models
-            assert kwargs["backend"] == "causal", (
-                "Currently, only OVModelForCausalLM is supported."
+            assert kwargs["backend"] in ["causal", "seq2seq"], (
+                "Currently, only OVModelForCausalLM or OVModelForSeq2SeqLM are supported."
            )

        self.openvino_device = device
@@ -54,7 +53,7 @@ class OptimumLM(HFLM):
                "package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
            )
        else:
-            from optimum.intel.openvino import OVModelForCausalLM
+            from optimum.intel.openvino import OVModelForCausalLM, OVModelForSeq2SeqLM

        model_kwargs = kwargs if kwargs else {}
        if "ov_config" in model_kwargs:
@@ -76,17 +75,14 @@ class OptimumLM(HFLM):
                model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = (
                    "PIPELINE_PARALLEL"
                )
-        model_file = Path(pretrained) / "openvino_model.xml"
-        if model_file.exists():
-            export = False
-        else:
-            export = True

-        self._model = OVModelForCausalLM.from_pretrained(
+        model_cls = (
+            OVModelForCausalLM if self.backend == "causal" else OVModelForSeq2SeqLM
+        )
+        self._model = model_cls.from_pretrained(
            pretrained,
            revision=revision,
            trust_remote_code=trust_remote_code,
-            export=export,
            device=self.openvino_device.upper(),
            **model_kwargs,
        )
--- a/lm_eval/models/sglang_causallms.py
+++ b/lm_eval/models/sglang_causallms.py
@@ -216,7 +216,7 @@ class SGLangLM(TemplateLM):
        # we group requests by their generation_kwargs,
        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
        # in the same batch.
-        re_ords = Collator(requests, _collate_gen, group_by="gen_kwargs")
+        re_ords = Collator(requests, _collate_gen, group_by=None)
        chunks = re_ords.get_batched(
            n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
        )
@@ -232,36 +232,41 @@ class SGLangLM(TemplateLM):
            context_and_encoding, all_gen_kwargs = zip(*chunk)
            context, context_encoding = zip(*context_and_encoding)

-            # we assume all gen kwargs in the batch are the same
-            # this is safe to assume because the `grouper` object ensures it.
-            gen_kwargs = all_gen_kwargs[0]
-            # unpack our keyword arguments.
-            if isinstance(gen_kwargs, dict):
-                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
-                # add EOS token to stop sequences
-                until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
-            else:
-                raise ValueError(
-                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
+            context_encoding_truncated = []
+            sampling_params = []
+            for x, gen_kwargs in zip(context_encoding, all_gen_kwargs):
+                # unpack our keyword arguments.
+                if isinstance(gen_kwargs, dict):
+                    kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
+                    # add EOS token to stop sequences
+                    until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
+                else:
+                    raise ValueError(
+                        f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
+                    )
+                if "max_gen_toks" in kwargs.keys():
+                    max_gen_toks = kwargs.pop("max_gen_toks")
+                else:
+                    max_gen_toks = self.max_gen_toks
+
+                # set the max length in tokens of inputs ("context_enc")
+                # max len for inputs = max length, minus room to generate the max new tokens
+                max_ctx_len = self.max_length - max_gen_toks
+                if len(x) > max_ctx_len:
+                    context_encoding_truncated.append(x[-max_ctx_len:])
+                else:
+                    context_encoding_truncated.append(x)
+                # create sampling params
+                kwargs = self.modify_gen_kwargs(kwargs)
+                sampling_params.append(
+                    kwargs | {"max_tokens": max_gen_toks, "stop": until}
                )
-            if "max_gen_toks" in kwargs.keys():
-                max_gen_toks = kwargs.pop("max_gen_toks")
-            else:
-                max_gen_toks = self.max_gen_toks
-
-            # set the max length in tokens of inputs ("context_enc")
-            # max len for inputs = max length, minus room to generate the max new tokens
-            max_ctx_len = self.max_length - max_gen_toks
-            context_encoding = [x[-max_ctx_len:] for x in context_encoding]
-
            # perform batched generation
            # cont is a list of dic. See here https://github.com/sgl-project/sglang/blob/0a6f18f068e4095fc228e798454e8496c9749214/python/sglang/srt/entrypoints/engine.py#L111 .
            cont = self._model_generate(
-                requests=context_encoding,
+                requests=context_encoding_truncated,
                generate=True,
-                max_tokens=max_gen_toks,
-                stop=until,
-                **kwargs,
+                sampling_params=sampling_params,
            )

            # cache generations
@@ -284,28 +289,22 @@ class SGLangLM(TemplateLM):
        self,
        requests: List[List[int]] = None,
        generate: bool = False,
-        max_tokens: int = None,
-        stop: Optional[List[str]] = None,
+        sampling_params: Union[List[Dict], Dict, None] = None,
        return_logprob: bool = False,
        top_logprobs_num: int = 1,
        logprob_start_len: int = -1,
-        **kwargs,
    ):
        # check sglang sampling parameters: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/sampling/sampling_params.py#L21  and https://docs.sglang.ai/references/sampling_params.html.
-        if generate:
-            kwargs = self.modify_gen_kwargs(kwargs)
-            sampling_params = {
-                "max_new_tokens": max_tokens,
-                "stop": stop,
-            }
-            sampling_params.update(kwargs)
-        else:
-            sampling_params = {
-                "temperature": 0,
-                "max_new_tokens": 1,
-            }
-            sampling_params.update(kwargs)
-
+        if not generate:
+            sampling_params = sampling_params if sampling_params else {}
+            sampling_params.update(
+                {
+                    "temperature": 0,
+                    "max_new_tokens": 1,
+                }
+            )
+        if not isinstance(sampling_params, List):
+            sampling_params = [sampling_params] * len(requests)
        # Refer to:  https://docs.sglang.ai/backend/offline_engine_api.html
        outputs = self.model.generate(
            input_ids=requests,

--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
 import copy
 import gc
-import inspect
 import logging
 import os
 from importlib.metadata import version
@@ -33,7 +32,7 @@ from lm_eval.utils import (

 try:
    import ray
-    from vllm import LLM, SamplingParams
+    from vllm import LLM, SamplingParams, TokensPrompt
    from vllm.lora.request import LoRARequest
    from vllm.transformers_utils.tokenizer import get_tokenizer
    from vllm.utils import get_open_port
@@ -51,7 +50,7 @@ eval_logger = logging.getLogger(__name__)

 def _vllm_mp_worker(
    model_args: dict,
-    sampling_params: "SamplingParams",
+    sampling_params: list["SamplingParams"],
    requests: list[list[int]],
    lora_request: "LoRARequest",
    result_queue: "Queue",
@@ -79,7 +78,7 @@ def _vllm_mp_worker(
    try:
        llm = LLM(**model_args)
        res = llm.generate(
-            prompt_token_ids=requests,
+            [TokensPrompt(prompt_token_ids=request) for request in requests],
            sampling_params=sampling_params,
            lora_request=lora_request,
        )
@@ -196,6 +195,12 @@ class VLLM(TemplateLM):
            self.batch_size = "auto"
            eval_logger.info("Manual batching is not compatible with data parallelism.")

+        if "gemma" in pretrained.lower():
+            add_bos_token = True
+            eval_logger.info(
+                "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
+            )
+
        from transformers import AutoConfig

        self._config = AutoConfig.from_pretrained(
@@ -214,11 +219,6 @@ class VLLM(TemplateLM):
            "enable_thinking", enable_thinking
        )
        self.add_bos_token = add_bos_token
-        if "gemma" in pretrained.lower():
-            self.add_bos_token = True
-            eval_logger.info(
-                "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
-            )

        if parse_version(version("vllm")) >= parse_version("0.8.3"):
            kwargs_resolve_hf_chat_template = {
@@ -239,13 +239,6 @@ class VLLM(TemplateLM):
                    model_config = engine_args.create_model_config()

                    kwargs_resolve_hf_chat_template["model_config"] = model_config
-
-            # https://github.com/vllm-project/vllm/pull/18259
-            if (
-                "trsut_remote_code"
-                in inspect.signature(resolve_hf_chat_template).parameters
-            ):
-                kwargs_resolve_hf_chat_template["trsut_remote_code"] = trust_remote_code
            else:
                kwargs_resolve_hf_chat_template["trust_remote_code"] = trust_remote_code

@@ -371,17 +364,14 @@ class VLLM(TemplateLM):
        self,
        requests: List[List[int]] = None,
        generate: bool = False,
-        max_tokens: int = None,
-        stop: Optional[List[str]] = None,
-        **kwargs,
+        sampling_params: Union[List["SamplingParams"], "SamplingParams", None] = None,
    ):
-        if generate:
-            kwargs = self.modify_gen_kwargs(kwargs)
-            sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
-        else:
+        if not generate or sampling_params is None:
            sampling_params = SamplingParams(
                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
+        if not isinstance(sampling_params, List):
+            sampling_params = [sampling_params] * len(requests)
        if self.data_parallel_size > 1 and not self.V1:
            # vLLM hangs if resources are set in ray.remote
            # also seems to only work with decorator and not with ray.remote() fn
@@ -389,13 +379,13 @@ class VLLM(TemplateLM):
            @ray.remote
            def run_inference_one_model(
                model_args: dict,
-                sampling_params: SamplingParams,
+                sampling_params: List["SamplingParams"],
                requests: List[List[int]],
-                lora_request: LoRARequest,
+                lora_request: "LoRARequest",
            ):
                llm = LLM(**model_args)
                return llm.generate(
-                    prompt_token_ids=requests,
+                    [TokensPrompt(prompt_token_ids=request) for request in requests],
                    sampling_params=sampling_params,
                    lora_request=lora_request,
                )
@@ -403,9 +393,12 @@ class VLLM(TemplateLM):
            # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
            # interleaved important to balance context lengths across workers
            requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
+            sampling_params = [
+                list(sp) for sp in distribute(self.data_parallel_size, sampling_params)
+            ]
            inputs = (
-                (self.model_args, sampling_params, req, self.lora_request)
-                for req in requests
+                (self.model_args, sp, req, self.lora_request)
+                for req, sp in zip(requests, sampling_params)
            )
            object_refs = [run_inference_one_model.remote(*x) for x in inputs]
            results = ray.get(object_refs)
@@ -420,16 +413,18 @@ class VLLM(TemplateLM):
            dp_master_port = os.environ.get("VLLM_DP_MASTER_PORT") or get_open_port()

            requests = (list(x) for x in distribute(self.data_parallel_size, requests))
-
+            sampling_params = (
+                list(sp) for sp in distribute(self.data_parallel_size, sampling_params)
+            )
            procs, resq = [], Queue()
            # We use Process as it is non-daemonic
            try:
-                for rank, req in enumerate(requests):
+                for rank, (sp, req) in enumerate(zip(requests, sampling_params)):
                    proc = Process(
                        target=_vllm_mp_worker,
                        args=(
                            self.model_args.copy(),
-                            sampling_params,
+                            sp,
                            req,
                            self.lora_request,
                            resq,
@@ -484,7 +479,7 @@ class VLLM(TemplateLM):

        else:
            outputs = self.model.generate(
-                prompt_token_ids=requests,
+                [TokensPrompt(prompt_token_ids=request) for request in requests],
                sampling_params=sampling_params,
                use_tqdm=True if self.batch_size == "auto" else False,
                lora_request=self.lora_request,
@@ -583,10 +578,11 @@ class VLLM(TemplateLM):
            # - any OOMs will happen right away rather than near the end
            return -len(_requests[0][1]), _requests[0][0]

-        # we group requests by their generation_kwargs,
-        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
-        # in the same batch.
-        re_ords = Collator(requests, _collate_gen, group_by="gen_kwargs")
+        re_ords = Collator(
+            requests,
+            _collate_gen,
+            group_by=None,
+        )
        chunks = re_ords.get_batched(
            n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
        )
@@ -601,41 +597,44 @@ class VLLM(TemplateLM):
        for chunk in chunks:
            context_and_encoding, all_gen_kwargs = zip(*chunk)
            context, context_encoding = zip(*context_and_encoding)
-            # we assume all gen kwargs in the batch are the same
-            # this is safe to assume because the `grouper` object ensures it.
-            gen_kwargs = all_gen_kwargs[0]
-            # unpack our keyword arguments.
-            if isinstance(gen_kwargs, dict):
-                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
-                # add EOS token to stop sequences
-                until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
-            else:
-                raise ValueError(
-                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
-                )
-            if "max_gen_toks" in kwargs.keys():
-                max_gen_toks = kwargs.pop("max_gen_toks")
-            else:
-                max_gen_toks = self.max_gen_toks
-
-            # set the max length in tokens of inputs ("context_enc")
-            # max len for inputs = max length, minus room to generate the max new tokens
-            max_ctx_len = self.max_length - max_gen_toks
-            all_lengths = [len(x) for x in context_encoding]
-            for length in all_lengths:
-                if length > max_ctx_len:
+            context_encoding_truncated = []
+            sampling_params = []
+            for x, gen_kwargs in zip(context_encoding, all_gen_kwargs):
+                # unpack our keyword arguments.
+                if isinstance(gen_kwargs, dict):
+                    kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
+                    # add EOS token to stop sequences
+                    until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
+                else:
+                    raise ValueError(
+                        f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
+                    )
+                if "max_gen_toks" in kwargs.keys():
+                    max_gen_toks = kwargs.pop("max_gen_toks")
+                else:
+                    max_gen_toks = self.max_gen_toks
+
+                # set the max length in tokens of inputs ("context_enc")
+                # max len for inputs = max length, minus room to generate the max new tokens
+                max_ctx_len = self.max_length - max_gen_toks
+                if len(x) > max_ctx_len:
                    eval_logger.warning(
-                        f"Context length {length} exceeds max length (context + max gen tokens): {max_ctx_len}. Truncating context."
+                        f"Context length {len(x)} exceeds max length (context + max gen tokens): {max_ctx_len}. Truncating context."
                    )
-            context_encoding = [x[-max_ctx_len:] for x in context_encoding]
+                    context_encoding_truncated.append(x[-max_ctx_len:])
+                else:
+                    context_encoding_truncated.append(x)
+                # create sampling params
+                kwargs = self.modify_gen_kwargs(kwargs)
+                sampling_params.append(
+                    SamplingParams(max_tokens=max_gen_toks, stop=until, **kwargs)
+                )

            # perform batched generation
            cont = self._model_generate(
-                requests=context_encoding,
+                requests=context_encoding_truncated,
                generate=True,
-                max_tokens=max_gen_toks,
-                stop=until,
-                **kwargs,
+                sampling_params=sampling_params,
            )

            # cache generations

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
-
 # Tasks

- A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`.
+A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`.

- For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
+For more information, including a full list of task names and their precise meanings or sources, follow the links
+provided to the individual README.md files for each subfolder.

 | Task Family                                                              | Description                                                                                                                                                                                                                                                                                                                            | Language(s)                                                                                                                   |
 |--------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
@@ -29,9 +29,12 @@
 | [belebele](belebele/README.md)                                           | Language understanding tasks in a variety of languages and scripts.                                                                                                                                                                                                                                                                    | Multiple (122 languages)                                                                                                      |
 | benchmarks                                                               | General benchmarking tasks that test a wide range of language understanding capabilities.                                                                                                                                                                                                                                              |                                                                                                                               |
 | [bertaqa](bertaqa/README.md)                                             | Local Basque cultural trivia QA tests in English and Basque languages.                                                                                                                                                                                                                                                                 | English, Basque, Basque (MT)                                                                                                  |
+| [bhs](bhs/README.md)                                           | Grammatical knowledge evaluation for low-resource langauges. | Basque, Hindi, Swahili                                                                                                                                                                                                                                              |
 | [bigbench](bigbench/README.md)                                           | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models.                                                                                                                                                                                                                                              | Multiple                                                                                                                      |
 | [blimp](blimp/README.md)                                                 | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities.                                                                                                                                                                                                                                              | English                                                                                                                       |
+| [blimp_nl](blimp_nl/README.md)                                           | A benchmark evaluating language models' grammatical capabilities in Dutch based on comparing the probabilities of minimal pairs of grammatical and ungrammatical sentences.                                                                                                                                                            | Dutch                                                                                                                         |
 | [c4](c4/README.md)                                                       | Tasks based on a colossal, cleaned version of Common Crawl's web crawl corpus to assess models' language modeling capabilities.                                                                                                                                                                                                        | English                                                                                                                       |
+| [cabbq](cabbq/README.md)                                                 | Adaptation of the [BBQ](bbq/README.md) benchmark to the Catalan language and stereotypes prevalent in Spain.                                                                                                                                                                                                                           | Catalan                                                                                                                       |
 | [careqa](careqa/README.md)                                               | Multiple choice and open-ended medical question answering based on the Spanish Specialised Healthcare Training (MIR) exams.                                                                                                                                                                                                            | English, Spanish                                                                                                              |
 | [catalan_bench](catalan_bench/README.md)                                 | Collection of tasks in Catalan encompassing various evaluation areas.                                                                                                                                                                                                                                                                  | Catalan                                                                                                                       |
 | [ceval](ceval/README.md)                                                 | Tasks that evaluate language understanding and reasoning in an educational context.                                                                                                                                                                                                                                                    | Chinese                                                                                                                       |
@@ -41,14 +44,17 @@
 | [copal_id](copal_id/README.md)                United States              | Indonesian causal commonsense reasoning dataset that captures local nuances.                                                                                                                                                                                                                                                           | Indonesian                                                                                                                    |
 | [coqa](coqa/README.md)                                                   | Conversational question answering tasks to test dialog understanding.                                                                                                                                                                                                                                                                  | English                                                                                                                       |
 | [crows_pairs](crows_pairs/README.md)                                     | Tasks designed to test model biases in various sociodemographic groups.                                                                                                                                                                                                                                                                | English, French                                                                                                               |
+| [click](click/README.md)                                                 | A benchmark dataset of Cultural and Linguistic Intelligence in Korean (CLIcK), comprising 1,995 QA pairs sourced from official Korean exams and textbooks to test Korean cultural and linguistic knowledge.                                                                                                                            | Korean                                                                                                                        |
 | csatqa                                                                   | Tasks related to SAT and other standardized testing questions for academic assessment.                                                                                                                                                                                                                                                 | Korean                                                                                                                        |
 | [darija_bench](darija_bench/README.md)                                   | Traditional NLP tasks (Translation, Summariation, etc..) for Moroccan Darija                                                                                                                                                                                                                                                           | Moroccan Darija (some MT)                                                                                                     |
 | [darijahellaswag](darijahellaswag/README.md)                             | Moroccan Darija version of HellaSwag.                                                                                                                                                                                                                                                                                                  | Moroccan Darija (MT)                                                                                                          |
 | [darijammlu](darijammlu/README.md)                                       | Multiple-choice QA in Moroccan Darija (an Arabic dialect).                                                                                                                                                                                                                                                                             | Moroccan Darija (MT)                                                                                                          |
+| [discrim_eval](discrim_eval/README.md)                                     | Prompts for binary decisions covering 70 scenarios to evaluate demographic bias. | English |
 | [drop](drop/README.md)                                                   | Tasks requiring numerical reasoning, reading comprehension, and question answering.                                                                                                                                                                                                                                                    | English                                                                                                                       |
 | [egyhellaswag](egyhellaswag/README.md)                                   | Egyptian Arabic (Masri) version of HellaSwag.                                                                                                                                                                                                                                                                                          | Egyptian Arabic (MT)                                                                                                          |
 | [egymmlu](egymmlu/README.md)                                             | Multiple-choice QA in Egyptian Arabic.                                                                                                                                                                                                                                                                                                 | Egyptian Arabic (MT)                                                                                                          |
 | [eq_bench](eq_bench/README.md)                                           | Tasks focused on equality and ethics in question answering and decision-making.                                                                                                                                                                                                                                                        | English                                                                                                                       |
+| [esbbq](esbbq/README.md)                                                   | Adaptation of the [BBQ](bbq/README.md) benchmark to the Spanish language and stereotypes prevalent in Spain.                                                                                                                                                                                                                           | Spanish                                                                                                                       |
 | [eus_exams](eus_exams/README.md)                                         | Tasks based on various professional and academic exams in the Basque language.                                                                                                                                                                                                                                                         | Basque                                                                                                                        |
 | [eus_proficiency](eus_proficiency/README.md)                             | Tasks designed to test proficiency in the Basque language across various topics.                                                                                                                                                                                                                                                       | Basque                                                                                                                        |
 | [eus_reading](eus_reading/README.md)                                     | Reading comprehension tasks specifically designed for the Basque language.                                                                                                                                                                                                                                                             | Basque                                                                                                                        |
@@ -71,6 +77,7 @@
 | [histoires_morales](histoires_morales/README.md)                         | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations.                                                                                                                                                                    | French (Some MT)                                                                                                              |
 | [hrm8k](hrm8k/README.md)                                                 | A challenging bilingual math reasoning benchmark for Korean and English.                                                                                                                                                                                                                                                               | Korean (Some MT), English (Some MT)                                                                                           |
 | [humaneval](humaneval/README.md)                                         | Code generation task that measure functional correctness for synthesizing programs from docstrings.                                                                                                                                                                                                                                    | Python                                                                                                                        |
+| [icelandic_winogrande](icelandic_winogrande/README.md)                                       | Manually translated and localized version of the [WinoGrande](winogrande/README.md) commonsense reasoning benchmark for Icelandic.                                                                                                                                                                                                                                         | Icelandic                                                                                                                       |
 | [ifeval](ifeval/README.md)                                               | Interactive fiction evaluation tasks for narrative understanding and reasoning.                                                                                                                                                                                                                                                        | English                                                                                                                       |
 | [inverse_scaling](inverse_scaling/README.md)                             | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse.                                                                                                                                                                                                            | English                                                                                                                       |
 | [japanese_leaderboard](japanese_leaderboard/README.md)                   | Japanese language understanding tasks to benchmark model performance on various linguistic aspects.                                                                                                                                                                                                                                    | Japanese                                                                                                                      |
@@ -85,9 +92,12 @@
 | [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`.                                                                                                                                                                                         | German, English, Spanish, French, Italian, Dutch, Portuguese                                                                  |
 | [leaderboard](leaderboard/README.md)                                     | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time.                                                                                                                                            | English                                                                                                                       |
 | [lingoly](lingoly/README.md)                                             | Challenging logical reasoning benchmark in low-resource languages with controls for memorization                                                                                                                                                                                                                                       | English, Multilingual                                                                                                         |
-| [libra](libra/README.md)                                                 | Evaluates long-context understanding in Russian across four complexity levels                                                                                                                                                                                                                                                          | Russian (MT)                                                                                                               |
+| [llama3](llama3/README.md)                                               | Evals reproducing those provided by the LLAMA team in the Hugging Face repo (instruct)                                                                                                                                                                                                                                                 | English, Multilingual                                                                                                         |
+| [libra](libra/README.md)                                                 | Evaluates long-context understanding in Russian across four complexity levels                                                                                                                                                                                                                                                          | Russian (MT)                                                                                                                  |
+| [lm_syneval](lm_syneval/README.md)                                       | Evaluates the syntactic capabilities of language models.                                                                                                                                                                                                                                                                               | English                                                                                                                       |
 | [logiqa](logiqa/README.md)                                               | Logical reasoning tasks requiring advanced inference and deduction.                                                                                                                                                                                                                                                                    | English, Chinese                                                                                                              |
 | [logiqa2](logiqa2/README.md)                                             | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination.                                                                                                                                                                                                                                              | English, Chinese                                                                                                              |
+| [longbench](longbench/README.md)                                         | LongBench evaluates language models' ability to understand lengthy texts across multiple tasks and languages.                                                                                                                                                                                                                          | English, Chinese                                                                                                              |
 | [mastermind](mastermind/README.md)                                       | Reasoning benchmark based on the board game of Mastermind.                                                                                                                                                                                                                                                                             | English                                                                                                                       |
 | [mathqa](mathqa/README.md)                                               | Question answering tasks involving mathematical reasoning and problem-solving.                                                                                                                                                                                                                                                         | English                                                                                                                       |
 | [mbpp](mbpp/README.md)                                                   | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions.                                                                                                                                                                                                                    | Python                                                                                                                        |
@@ -107,7 +117,7 @@
 | [mmlu](mmlu/README.md)                                                   | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported.                                                                                                                                                                                                               | English                                                                                                                       |
 | [mmlu_pro](mmlu_pro/README.md)                                           | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options.                                                                                                                                                                                                | English                                                                                                                       |
 | [mmlu-pro-plus](mmlu-pro-plus/README.md)                                 | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs.                                                                                                                                                                                                                                                    | English                                                                                                                       |
-| [mmlu_prox](mmlu_prox/README.md)                                         | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation.                                                                                                                                                                                                                      | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Swahili, Thai, Arabic, Hindi, Bengali                |
+| [mmlu_prox](mmlu_prox/README.md)                                         | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation.                                                                                                                                                                                                                      | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Zulu, Swahili, Wolof, Yoruba, Thai, Arabic, Hindi, Bengali, Serbian, Hungarian, Vietnamese, Czech, Marathi, Afrikaans, Nepali, Telugu, Urdu, Russian, Indonesian, Italian, Ukrainian|
 | [mmlusr](mmlusr/README.md)                                               | Variation of MMLU designed to be more rigorous.                                                                                                                                                                                                                                                                                        | English                                                                                                                       |
 | model_written_evals                                                      | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns.                                                                                                                                                                                                                                                     |                                                                                                                               |
 | [moral_stories](moral_stories/README.md)                                 | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations.                                                                                                                                                      | English                                                                                                                       |
@@ -156,6 +166,7 @@
 | [truthfulqa](truthfulqa/README.md)                                       | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses.                                                                                                                                                                                                                                                | English                                                                                                                       |
 | [truthfulqa-multi](truthfulqa-multi/README.md)                           | Is a multilingual version of TruthfulQA, a QA task aimed at evaluating the truthfulness and factual accuracy of model responses.                                                                                                                                                                                                       | English, Spanish, Catalan, Basque, Galician                                                                                   |
 | [turkishmmlu](turkishmmlu/README.md)                                     | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams.                                                                                                                                                                                                                             | Turkish                                                                                                                       |
+| [turblimp_core](turblimp/README.md)                                      | A benchmark evaluating language models' grammatical capabilities in Turkish based on comparing the probabilities of minimal pairs of grammatical and ungrammatical sentences.                                                                                                                                                          | Turkish                                                                                                                       |
 | [unitxt](unitxt/README.md)                                               | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI.                                                                                                                                                                                        | English                                                                                                                       |
 | [unscramble](unscramble/README.md)                                       | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding.                                                                                                                                                                                                                                              | English                                                                                                                       |
 | [webqs](webqs/README.md)                                                 | Web-based question answering tasks designed to evaluate internet search and retrieval.                                                                                                                                                                                                                                                 | English                                                                                                                       |
@@ -171,8 +182,10 @@
 | [xquad](xquad/README.md)                                                 | Cross-lingual Question Answering Dataset in multiple languages.                                                                                                                                                                                                                                                                        | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese                         |
 | [xstorycloze](xstorycloze/README.md)                                     | Cross-lingual narrative understanding tasks to predict story endings in multiple languages.                                                                                                                                                                                                                                            | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese                             |
 | [xwinograd](xwinograd/README.md)                                         | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages.                                                                                                                                                                                                                                                  | English, French, Japanese, Portuguese, Russian, Chinese                                                                       |
+| [zhoblimp](zhoblimp/README.md)                                           | A benchmark evaluating language models' grammatical capabilities in Chinese based on comparing the probabilities of minimal pairs of grammatical and ungrammatical sentences.                                                                                                                                                          | Chinese                                                                                                                       |

 ## Multimodal Tasks
+
 | Task Family                  | Description                                                                                             | Modality    |
 |------------------------------|---------------------------------------------------------------------------------------------------------|-------------|
 | [chartqa](chartqa/README.md) | A benchmark for question answering about charts that requires both visual and logical reasoning.        | Image, Text |

--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -81,7 +81,7 @@ class TaskManager:
        task_index = {}
        for task_dir in all_paths:
            tasks = self._get_task_and_group(task_dir)
-            task_index = {**tasks, **task_index}
+            task_index = {**task_index, **tasks}

        return task_index


--- a/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_yaml
+++ b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_yaml
@@ -2,7 +2,6 @@ tag:
 - adr_tasks
 - adr_prompt_1
 dataset_path: masakhane/diacritics-restoration
-dataset_kwargs: {trust_remote_code: True}
 doc_to_target: target
 output_type: generate_until
 fewshot_split: dev

--- a/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_yaml
+++ b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_yaml
@@ -2,7 +2,6 @@ tag:
 - adr_tasks
 - adr_prompt_2
 dataset_path: masakhane/diacritics-restoration
-dataset_kwargs: {trust_remote_code: True}
 doc_to_target: target
 output_type: generate_until
 fewshot_split: dev

--- a/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_yaml
+++ b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_yaml
@@ -2,7 +2,6 @@ tag:
 - adr_tasks
 - adr_prompt_3
 dataset_path: masakhane/diacritics-restoration
-dataset_kwargs: {trust_remote_code: True}
 doc_to_target: target
 output_type: generate_until
 fewshot_split: dev

--- a/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_yaml
+++ b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_yaml
@@ -2,7 +2,6 @@ tag:
 - adr_tasks
 - adr_prompt_4
 dataset_path: masakhane/diacritics-restoration
-dataset_kwargs: {trust_remote_code: True}
 doc_to_target: target
 output_type: generate_until
 fewshot_split: dev

--- a/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_yaml
+++ b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_yaml
@@ -2,7 +2,6 @@ tag:
 - adr_tasks
 - adr_prompt_5
 dataset_path: masakhane/diacritics-restoration
-dataset_kwargs: {trust_remote_code: True}
 doc_to_target: target
 output_type: generate_until
 fewshot_split: dev

--- a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti
+++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti
@@ -4,7 +4,6 @@ tag:
 task: null
 dataset_path: masakhane/afrisenti
 dataset_name: null
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test

--- a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti
+++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti
@@ -3,7 +3,6 @@ tag:
    - afrisent_prompt_2
 dataset_path: masakhane/afrisenti
 dataset_name: null
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test

--- a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti
+++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti
@@ -3,7 +3,6 @@ tag:
    - afrisenti_prompt_3
 dataset_path: masakhane/afrisenti
 dataset_name: null
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test

--- a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti
+++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti
@@ -3,7 +3,6 @@ tag:
    - afrisenti_prompt_4
 dataset_path: masakhane/afrisenti
 dataset_name: null
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test

--- a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti
+++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti
@@ -3,7 +3,6 @@ tag:
    - afrisenti_prompt_5
 dataset_path: masakhane/afrisenti
 dataset_name: null
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test