Commit c4b0c0cb authored by Baber

Merge branch 'main' into metrics

# Conflicts:
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 6b20ae8c de496b80
......@@ -5,8 +5,9 @@ import traceback
from typing import Iterator, List, Sequence, Tuple, TypeVar
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
# This is a cpp module.
# See scripts/clean_training_data/README.md for instructions to compile janitor_util.cpp
try:
import janitor_util
......
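
The comment change above points readers to the README for building janitor_util; for context, the optional-extension import pattern this sits in looks roughly like the sketch below (the flag name and the pure-Python fallback are illustrative, not taken from the file):

try:
    import janitor_util  # compiled pybind11 extension; build per scripts/clean_training_data/README.md
    JANITOR_CPP = True
except ImportError:
    # Illustrative fallback: continue with a slower pure-Python implementation.
    JANITOR_CPP = False
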
......@@ -71,13 +71,6 @@ class SteeredModel(HFLM):
"""
HFLM with a steered forward pass.
To derive steering vectors from a sparse model loadable with sparsify or sae_lens,
provide the path to a CSV file with the following columns (example rows are provided below):
loader,action,sparse_model,hookpoint,feature_index,steering_coefficient,sae_id,description,
sparsify,add,EleutherAI/sae-pythia-70m-32k,layers.3,30,10.0,,,
sae_lens,add,gemma-scope-2b-pt-res-canonical,layers.20,12082,240.0,layer_20/width_16k/canonical,increase dogs,
To load steering vectors directly, provide the path to a pytorch (.pt) file with content in the following format:
{
......@@ -86,9 +79,17 @@ class SteeredModel(HFLM):
"steering_coefficient": <float>,
"action": <Literal["add", "clamp"]>,
"bias": <torch.Tensor | None>,
"head_index": <int | None>,
},
...
}
To derive steering vectors from a sparse model loadable with sparsify or sae_lens,
provide the path to a CSV file with the following columns (example rows are provided below):
loader,action,sparse_model,hookpoint,feature_index,steering_coefficient,head_index,sae_id,description,
sparsify,add,EleutherAI/sae-pythia-70m-32k,layers.3,30,10.0,,,,
sae_lens,add,gemma-scope-2b-pt-res-canonical,layers.20,12082,240.0,,layer_20/width_16k/canonical,increase dogs,
"""
super().__init__(pretrained=pretrained, device=device, **kwargs)
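
As an illustration of the .pt format documented in the docstring above, a minimal sketch of writing a compatible steering file; the hookpoint name, feature size, and file path are made up:

import torch

steer_config = {
    "layers.3": {                              # hookpoint name (illustrative)
        "steering_vector": torch.randn(512),   # [features]
        "steering_coefficient": 10.0,
        "action": "add",                       # or "clamp"
        "bias": None,
        "head_index": None,
    },
}
torch.save(steer_config, "steer_config.pt")
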
......@@ -105,27 +106,31 @@ class SteeredModel(HFLM):
hook_to_steer = {}
for hookpoint, steer_info in steer_config.items():
action = steer_info["action"]
steering_coefficient = steer_info["steering_coefficient"]
steering_vector = (
steer_info["steering_vector"].to(self.device).to(self.model.dtype)
)
bias = (
steer_info["bias"].to(self.device).to(self.model.dtype)
if steer_info["bias"] is not None
else None
)
steering_coefficient = float(steer_info.get("steering_coefficient", 1.0))
head_index = steer_info.get("head_index", None)
bias = steer_info.get("bias", None)
if bias is not None:
bias = bias.to(self.device).to(self.model.dtype)
if action == "add":
# Steers the model by adding some multiple of a steering vector to all sequence positions.
hook_to_steer[hookpoint] = (
lambda acts: acts + steering_coefficient * steering_vector
# Steer the model by adding a multiple of a steering vector to all sequence positions.
assert bias is None, "Bias is not supported for the `add` action."
hook_to_steer[hookpoint] = partial(
self.add,
vector=steering_vector * steering_coefficient,
head_index=head_index,
)
elif action == "clamp":
# Steer the model by clamping the activations to a value in the direction of the steering vector.
hook_to_steer[hookpoint] = partial(
self.clamp,
steering_vector=steering_vector,
direction=steering_vector / torch.norm(steering_vector),
value=steering_coefficient,
bias=bias,
head_index=head_index,
)
else:
raise ValueError(f"Unknown hook type: {action}")
......@@ -195,34 +200,62 @@ class SteeredModel(HFLM):
return steer_data
@classmethod
def add(
cls,
acts: Tensor,
vector: Tensor,
head_index: Optional[int],
):
"""Adds the given vector to the activations.
Args:
acts (Tensor): The activations tensor to edit of shape [batch, pos, ..., features]
vector (Tensor): A vector to add of shape [features]
head_index (int | None): Optional attention head index to add to
"""
if head_index is not None:
acts[:, :, head_index, :] = acts[:, :, head_index, :] + vector
else:
acts = acts + vector
return acts
@classmethod
def clamp(
cls,
acts: Tensor,
steering_vector: Tensor,
direction: Tensor,
value: float,
head_index: Optional[int],
bias: Optional[Tensor] = None,
):
"""Clamps a direction of the activations to be the steering vector * the value.
"""Clamps the activations to a given value in a specified direction. The direction
must be a unit vector.
Args:
acts (Tensor): The activations tensor to edit of shape [batch, pos, features]
steering_vector (Tensor): A direction to clamp of shape [features]
acts (Tensor): The activations tensor to edit of shape [batch, pos, ..., features]
direction (Tensor): A direction to clamp of shape [features]
value (float): Value to clamp the direction to
head_index (int | None): Optional attention head index to clamp
bias (Tensor | None): Optional bias to add to the activations
Returns:
Tensor: The modified activations with the specified direction clamped
"""
if bias is not None:
acts = acts - bias
direction = steering_vector / torch.norm(steering_vector)
proj_magnitude = torch.sum(acts * direction, dim=-1, keepdim=True)
orthogonal_component = acts - proj_magnitude * direction
if head_index is not None:
x = acts[:, :, head_index, :]
proj = (x * direction).sum(dim=-1, keepdim=True)
assert proj == acts @ direction
clamped = orthogonal_component + direction * value
clamped = acts.clone()
clamped[:, :, head_index, :] = x + direction * (value - proj)
else:
proj = torch.sum(acts * direction, dim=-1, keepdim=True)
clamped = acts + direction * (value - proj)
if bias is not None:
return clamped + bias
......
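
A small numeric check of the clamp behaviour described above: after clamping, the component of the activations along the unit direction equals value, while the orthogonal component is left untouched. Shapes and values here are illustrative:

import torch

acts = torch.randn(2, 4, 16)                   # [batch, pos, features]
direction = torch.randn(16)
direction = direction / torch.norm(direction)  # must be a unit vector
value = 5.0

proj = torch.sum(acts * direction, dim=-1, keepdim=True)
clamped = acts + direction * (value - proj)

new_proj = torch.sum(clamped * direction, dim=-1, keepdim=True)
assert torch.allclose(new_proj, torch.full_like(new_proj, value), atol=1e-5)
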
......@@ -124,14 +124,22 @@ class HFLM(TemplateLM):
assert isinstance(pretrained, str)
assert isinstance(batch_size, (int, str))
gpus = torch.cuda.device_count()
accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
if accelerator.num_processes > 1:
self.accelerator = accelerator
if "npu" in accelerator.device.type:
# Detect device count based on accelerator device type
device_type = accelerator.device.type
if "cuda" in device_type:
gpus = torch.cuda.device_count()
elif "npu" in device_type:
gpus = torch.npu.device_count()
elif "xpu" in device_type:
gpus = torch.xpu.device_count()
else:
# Fallback to CUDA count for compatibility
gpus = torch.cuda.device_count()
# using one process with no model parallelism
if not (parallelize or accelerator.num_processes > 1):
......@@ -141,6 +149,7 @@ class HFLM(TemplateLM):
+ [f"cuda:{i}" for i in range(gpus)]
+ ["mps", "mps:0"]
+ [f"npu:{i}" for i in range(gpus)]
+ [f"xpu:{i}" for i in range(gpus)]
)
if device and device in device_list:
self._device = torch.device(device)
......@@ -679,10 +688,19 @@ class HFLM(TemplateLM):
"0.4.0"
):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# Compatible with Gemma3 (multimodal) and old models
if hasattr(self._model.config, "text_config") and hasattr(
self._model.config.text_config, "vocab_size"
):
vocab_size = self._model.config.text_config.vocab_size
else:
vocab_size = self._model.config.vocab_size
if vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
eval_logger.info(
f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
f"Model config indicates vocab_size='{vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
)
self._model.resize_token_embeddings(len(self.tokenizer))
self._model = PeftModel.from_pretrained(
......
......@@ -290,7 +290,7 @@ class OpenAIChatCompletion(LocalChatCompletion):
"seed": seed,
**gen_kwargs,
}
if "o1" in self.model:
if "o1" in self.model or "5" in self.model:
output.pop("stop")
output["temperature"] = 1
elif "o3" in self.model:
......
......@@ -28,9 +28,8 @@ class OptimumLM(HFLM):
**kwargs,
) -> None:
if "backend" in kwargs:
# optimum currently only supports causal models
assert kwargs["backend"] == "causal", (
"Currently, only OVModelForCausalLM is supported."
assert kwargs["backend"] in ["causal", "seq2seq"], (
"Currently, only OVModelForCausalLM or OVModelForSeq2SeqLM are supported."
)
self.openvino_device = device
......@@ -54,7 +53,7 @@ class OptimumLM(HFLM):
"package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
)
else:
from optimum.intel.openvino import OVModelForCausalLM
from optimum.intel.openvino import OVModelForCausalLM, OVModelForSeq2SeqLM
model_kwargs = kwargs if kwargs else {}
if "ov_config" in model_kwargs:
......@@ -76,17 +75,14 @@ class OptimumLM(HFLM):
model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = (
"PIPELINE_PARALLEL"
)
model_file = Path(pretrained) / "openvino_model.xml"
if model_file.exists():
export = False
else:
export = True
self._model = OVModelForCausalLM.from_pretrained(
model_cls = (
OVModelForCausalLM if self.backend == "causal" else OVModelForSeq2SeqLM
)
self._model = model_cls.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
export=export,
device=self.openvino_device.upper(),
**model_kwargs,
)
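
A hedged sketch of using the new backend option; the class name comes from the hunk header above, but the module path, model path, and remaining constructor arguments are assumptions and other HFLM arguments may still be required:

from lm_eval.models.optimum_lm import OptimumLM

# backend="seq2seq" now selects OVModelForSeq2SeqLM instead of OVModelForCausalLM.
lm = OptimumLM(
    pretrained="path/to/openvino_seq2seq_model",  # hypothetical exported OV model directory
    backend="seq2seq",
    device="cpu",
)
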
......@@ -216,7 +216,7 @@ class SGLangLM(TemplateLM):
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(requests, _collate_gen, group_by="gen_kwargs")
re_ords = Collator(requests, _collate_gen, group_by=None)
chunks = re_ords.get_batched(
n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
)
......@@ -232,36 +232,41 @@ class SGLangLM(TemplateLM):
context_and_encoding, all_gen_kwargs = zip(*chunk)
context, context_encoding = zip(*context_and_encoding)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
context_encoding_truncated = []
sampling_params = []
for x, gen_kwargs in zip(context_encoding, all_gen_kwargs):
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
if len(x) > max_ctx_len:
context_encoding_truncated.append(x[-max_ctx_len:])
else:
context_encoding_truncated.append(x)
# create sampling params
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params.append(
kwargs | {"max_tokens": max_gen_toks, "stop": until}
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
context_encoding = [x[-max_ctx_len:] for x in context_encoding]
# perform batched generation
# cont is a list of dic. See here https://github.com/sgl-project/sglang/blob/0a6f18f068e4095fc228e798454e8496c9749214/python/sglang/srt/entrypoints/engine.py#L111 .
cont = self._model_generate(
requests=context_encoding,
requests=context_encoding_truncated,
generate=True,
max_tokens=max_gen_toks,
stop=until,
**kwargs,
sampling_params=sampling_params,
)
# cache generations
......@@ -284,28 +289,22 @@ class SGLangLM(TemplateLM):
self,
requests: List[List[int]] = None,
generate: bool = False,
max_tokens: int = None,
stop: Optional[List[str]] = None,
sampling_params: Union[List[Dict], Dict, None] = None,
return_logprob: bool = False,
top_logprobs_num: int = 1,
logprob_start_len: int = -1,
**kwargs,
):
# check sglang sampling parameters: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/sampling/sampling_params.py#L21 and https://docs.sglang.ai/references/sampling_params.html.
if generate:
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params = {
"max_new_tokens": max_tokens,
"stop": stop,
}
sampling_params.update(kwargs)
else:
sampling_params = {
"temperature": 0,
"max_new_tokens": 1,
}
sampling_params.update(kwargs)
if not generate:
sampling_params = sampling_params if sampling_params else {}
sampling_params.update(
{
"temperature": 0,
"max_new_tokens": 1,
}
)
if not isinstance(sampling_params, List):
sampling_params = [sampling_params] * len(requests)
# Refer to: https://docs.sglang.ai/backend/offline_engine_api.html
outputs = self.model.generate(
input_ids=requests,
......
......@@ -52,10 +52,10 @@ eval_logger = logging.getLogger(__name__)
def _vllm_mp_worker(
model_args: dict,
sampling_params: SamplingParams,
sampling_params: list["SamplingParams"],
requests: list[list[int]],
lora_request: LoRARequest,
result_queue: Queue,
lora_request: "LoRARequest",
result_queue: "Queue",
dp_size: int,
local_dp_rank: int,
dp_master_port: int,
......@@ -197,6 +197,12 @@ class VLLM(TemplateLM):
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
if "gemma" in pretrained.lower():
add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
)
from transformers import AutoConfig
self._config = AutoConfig.from_pretrained(
......@@ -215,11 +221,6 @@ class VLLM(TemplateLM):
"enable_thinking", enable_thinking
)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
)
if parse_version(version("vllm")) >= parse_version("0.8.3"):
kwargs_resolve_hf_chat_template = {
......@@ -365,17 +366,14 @@ class VLLM(TemplateLM):
self,
requests: list[list[int]] = None,
generate: bool = False,
max_tokens: int = None,
stop: list[str] | None = None,
**kwargs,
sampling_params: list["SamplingParams"] | "SamplingParams" | None = None,
):
if generate:
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
else:
if not generate or sampling_params is None:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if not isinstance(sampling_params, List):
sampling_params = [sampling_params] * len(requests)
if self.data_parallel_size > 1 and not self.V1:
# vLLM hangs if resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
......@@ -383,9 +381,9 @@ class VLLM(TemplateLM):
@ray.remote
def run_inference_one_model(
model_args: dict,
sampling_params: SamplingParams,
sampling_params: list["SamplingParams"],
requests: list[list[int]],
lora_request: LoRARequest,
lora_request: "LoRARequest",
):
llm = LLM(**model_args)
return llm.generate(
......@@ -397,9 +395,12 @@ class VLLM(TemplateLM):
# dispatch requests to all self.data_parallel_size workers, in interleaved fashion
# interleaved important to balance context lengths across workers
requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
sampling_params = [
list(sp) for sp in distribute(self.data_parallel_size, sampling_params)
]
inputs = (
(self.model_args, sampling_params, req, self.lora_request)
for req in requests
(self.model_args, sp, req, self.lora_request)
for req, sp in zip(requests, sampling_params)
)
object_refs = [run_inference_one_model.remote(*x) for x in inputs]
results = ray.get(object_refs)
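
The interleaving above relies on more_itertools.distribute, which deals items round-robin across n children, so long and short contexts spread evenly across workers instead of clustering on one. A quick illustration with made-up request labels:

from more_itertools import distribute

requests = ["r0", "r1", "r2", "r3", "r4"]
workers = [list(x) for x in distribute(2, requests)]
print(workers)  # [['r0', 'r2', 'r4'], ['r1', 'r3']]
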
......@@ -414,16 +415,18 @@ class VLLM(TemplateLM):
dp_master_port = os.environ.get("VLLM_DP_MASTER_PORT") or get_open_port()
requests = (list(x) for x in distribute(self.data_parallel_size, requests))
sampling_params = (
list(sp) for sp in distribute(self.data_parallel_size, sampling_params)
)
procs, resq = [], Queue()
# We use Process as it is non-daemonic
try:
for rank, req in enumerate(requests):
for rank, (sp, req) in enumerate(zip(requests, sampling_params)):
proc = Process(
target=_vllm_mp_worker,
args=(
self.model_args.copy(),
sampling_params,
sp,
req,
self.lora_request,
resq,
......@@ -577,10 +580,11 @@ class VLLM(TemplateLM):
# - any OOMs will happen right away rather than near the end
return -len(_requests[0][1]), _requests[0][0]
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(requests, _collate_gen, group_by="gen_kwargs")
re_ords = Collator(
requests,
_collate_gen,
group_by=None,
)
chunks = re_ords.get_batched(
n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
)
......@@ -595,41 +599,44 @@ class VLLM(TemplateLM):
for chunk in chunks:
context_and_encoding, all_gen_kwargs = zip(*chunk)
context, context_encoding = zip(*context_and_encoding)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs:
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
all_lengths = [len(x) for x in context_encoding]
for length in all_lengths:
if length > max_ctx_len:
context_encoding_truncated = []
sampling_params = []
for x, gen_kwargs in zip(context_encoding, all_gen_kwargs):
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
if len(x) > max_ctx_len:
eval_logger.warning(
f"Context length {length} exceeds max length (context + max gen tokens): {max_ctx_len}. Truncating context."
f"Context length {len(x)} exceeds max length (context + max gen tokens): {max_ctx_len}. Truncating context."
)
context_encoding = [x[-max_ctx_len:] for x in context_encoding]
context_encoding_truncated.append(x[-max_ctx_len:])
else:
context_encoding_truncated.append(x)
# create sampling params
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params.append(
SamplingParams(max_tokens=max_gen_toks, stop=until, **kwargs)
)
# perform batched generation
cont = self._model_generate(
requests=context_encoding,
requests=context_encoding_truncated,
generate=True,
max_tokens=max_gen_toks,
stop=until,
**kwargs,
sampling_params=sampling_params,
)
# cache generations
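
The per-request pattern above, one SamplingParams per prompt instead of one per batch, is what lets requests with different gen kwargs share a batch. A minimal standalone sketch of the idea, with illustrative token ids and kwargs:

from vllm import SamplingParams

prompt_token_ids = [[101, 7592], [101, 2129, 2024]]     # illustrative token ids
per_request_kwargs = [
    {"temperature": 0.0, "max_tokens": 32, "stop": ["\n"]},
    {"temperature": 0.8, "max_tokens": 64, "stop": ["\n"]},
]
sampling_params = [SamplingParams(**kw) for kw in per_request_kwargs]

# llm.generate accepts a list of SamplingParams matching the prompts one-to-one:
# outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)
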
......@@ -674,7 +681,7 @@ class VLLM(TemplateLM):
for chunk in chunks:
inputs = []
ctxlens = []
for _cache_key, context_enc, continuation_enc in chunk:
for cache_key, context_enc, continuation_enc in chunk:
if (
full_length := len(context_enc + continuation_enc)
) > self.max_length:
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_1
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_2
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_3
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_4
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_5
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -4,7 +4,6 @@ tag:
task: null
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -3,7 +3,6 @@ tag:
- afrisent_prompt_2
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -3,7 +3,6 @@ tag:
- afrisenti_prompt_3
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -3,7 +3,6 @@ tag:
- afrisenti_prompt_4
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -3,7 +3,6 @@ tag:
- afrisenti_prompt_5
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
def doc_to_text(doc):
output = """Please provide the POS tags for each word in the input sentence. The input will be a list of words in
the sentence. The output format should be a list of tuples, where each tuple consists of a word from the input text
and its corresponding POS tag label from the tag label set: ["ADJ", "ADP", "ADV", "AUX", "CCONJ, "DET", "INTJ",
and its corresponding POS tag label from the tag label set: ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ",
"NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT" "SCONJ", "SYM", "VERB", "X"]. \nYour response should include only a
list of tuples, in the order that the words appear in the input sentence, with each tuple containing the
corresponding POS tag label for a word.
......
......@@ -2,7 +2,6 @@ tag:
- afrobench_sentiment_tasks
- nollysenti_prompt_1
dataset_path: Davlan/nollysenti
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......