Commit bd028848 authored by Baber

Merge branch 'main' into metrics

# Conflicts:
#	tests/test_tasks.py
parents 6e48110e 56def33d
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
--- ---
## Latest News 📣 ## Latest News 📣
- [2025/07] Added `think_end_token` arg to `hf` (token/str), `vllm` and `sglang` (str) for stripping CoT reasoning traces from models that support it.
- [2025/03] Added support for steering HF models! - [2025/03] Added support for steering HF models!
- [2025/02] Added [SGLang](https://docs.sglang.ai/) support! - [2025/02] Added [SGLang](https://docs.sglang.ai/) support!
- [2024/09] We are prototyping allowing users of LM Evaluation Harness to create and evaluate on text+image multimodal input, text output tasks, and have just added the `hf-multimodal` and `vllm-vlm` model types and `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forking off of the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features. - [2024/09] We are prototyping allowing users of LM Evaluation Harness to create and evaluate on text+image multimodal input, text output tasks, and have just added the `hf-multimodal` and `vllm-vlm` model types and `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forking off of the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features.
......
...@@ -21,7 +21,11 @@ When subclassing `TemplateAPI`, you need to implement the following methods: ...@@ -21,7 +21,11 @@ When subclassing `TemplateAPI`, you need to implement the following methods:
1. `_create_payload`: Creates the JSON payload for API requests. 1. `_create_payload`: Creates the JSON payload for API requests.
2. `parse_logprobs`: Parses log probabilities from API responses. 2. `parse_logprobs`: Parses log probabilities from API responses.
3. `parse_generations`: Parses generated text from API responses. 3. `parse_generations`: Parses generated text from API responses.
4. `headers`: Returns the headers for the API request.
Optional Properties:
4. `header`: Returns the headers for the API request.
5. `api_key`: Returns the API key for authentication (if required).
You may also need to override other methods or properties depending on your API's specific requirements. You may also need to override other methods or properties depending on your API's specific requirements.
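For readers new to this interface, here is a minimal, hypothetical sketch of such a subclass. The class name, registry name, payload fields, and response schema are all made up for illustration, and the method signatures are simplified; consult the `TemplateAPI` base class for the exact argument lists.

```python
# A minimal sketch of a TemplateAPI subclass for a hypothetical JSON completion API.
# Field names, the registry name, and the response schema are assumptions.
from typing import Any, Dict, List

from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI


@register_model("my-json-api")  # illustrative registry name
class MyJSONAPI(TemplateAPI):
    def _create_payload(self, messages, generate=True, gen_kwargs=None, **kwargs) -> Dict[str, Any]:
        # Build whatever request body your endpoint expects.
        return {"model": self.model, "prompt": messages, **(gen_kwargs or {})}

    @staticmethod
    def parse_generations(outputs, **kwargs) -> List[str]:
        # Pull generated strings out of the (assumed) response schema.
        outputs = outputs if isinstance(outputs, list) else [outputs]
        return [choice["text"] for out in outputs for choice in out.get("choices", [])]

    @staticmethod
    def parse_logprobs(outputs, **kwargs):
        # Return per-request (logprob_sum, is_greedy) tuples; omitted in this sketch.
        raise NotImplementedError

    @property
    def header(self) -> dict:
        # Optional: custom headers instead of the default bearer-token header.
        return {"Authorization": f"Bearer {self.api_key}", "X-Custom": "value"}
```

With a registration like this, the subclass would then be selectable via `--model my-json-api`.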
...@@ -97,6 +101,10 @@ When initializing a `TemplateAPI` instance or a subclass, you can provide severa ...@@ -97,6 +101,10 @@ When initializing a `TemplateAPI` instance or a subclass, you can provide severa
- Whether to validate the certificate of the API endpoint (if HTTPS). - Whether to validate the certificate of the API endpoint (if HTTPS).
- Default is True. - Default is True.
- `header` (dict, optional):
- Custom headers for API requests.
- If not provided, uses `{"Authorization": f"Bearer {self.api_key}"}` by default.
Example usage: Example usage:
```python ```python
......
...@@ -436,7 +436,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -436,7 +436,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
args.model_args = args.model_args + ",trust_remote_code=True" if isinstance(args.model_args, dict):
args.model_args["trust_remote_code"] = True
else:
args.model_args = args.model_args + ",trust_remote_code=True"
( (
eval_logger.info(f"Selected Tasks: {task_names}") eval_logger.info(f"Selected Tasks: {task_names}")
if eval_logger.getEffectiveLevel() >= logging.INFO if eval_logger.getEffectiveLevel() >= logging.INFO
......
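As context for the hunk above: `model_args` can now arrive either as a comma-separated string or as an already-parsed dict, so `trust_remote_code` has to be injected differently in each case. A small standalone sketch of the same normalization (the helper name is illustrative):

```python
# Illustrative helper mirroring the trust_remote_code normalization above.
def add_trust_remote_code(model_args):
    if isinstance(model_args, dict):
        model_args["trust_remote_code"] = True
    else:
        # comma-separated "key=value" string form
        model_args = model_args + ",trust_remote_code=True"
    return model_args


print(add_trust_remote_code("pretrained=gpt2"))
# pretrained=gpt2,trust_remote_code=True
print(add_trust_remote_code({"pretrained": "gpt2"}))
# {'pretrained': 'gpt2', 'trust_remote_code': True}
```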
...@@ -510,7 +510,6 @@ def bootstrap_stderr( ...@@ -510,7 +510,6 @@ def bootstrap_stderr(
if not os.getenv("DISABLE_MULTIPROC"): if not os.getenv("DISABLE_MULTIPROC"):
import multiprocessing as mp import multiprocessing as mp
pool = mp.Pool(mp.cpu_count())
# this gives a biased estimate of the stderr (i.e w/ the mean, it gives something # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
# equivalent to stderr calculated without Bessel's correction in the stddev. # equivalent to stderr calculated without Bessel's correction in the stddev.
# Unfortunately, I haven't been able to figure out what the right correction is # Unfortunately, I haven't been able to figure out what the right correction is
...@@ -522,17 +521,16 @@ def bootstrap_stderr( ...@@ -522,17 +521,16 @@ def bootstrap_stderr(
from tqdm import tqdm from tqdm import tqdm
print("bootstrapping for stddev:", f.__name__) print("bootstrapping for stddev:", f.__name__)
for bootstrap in tqdm( with mp.Pool(mp.cpu_count()) as pool:
pool.imap( for bootstrap in tqdm(
_bootstrap_internal(f, chunk_size), pool.imap(
[(i, xs) for i in range(iters // chunk_size)], _bootstrap_internal(f, chunk_size),
), [(i, xs) for i in range(iters // chunk_size)],
total=iters // chunk_size, ),
): total=iters // chunk_size,
# sample w replacement ):
res.extend(bootstrap) # sample w replacement
res.extend(bootstrap)
pool.close()
else: else:
res = _bootstrap_internal_no_mp(f, xs, iters) res = _bootstrap_internal_no_mp(f, xs, iters)
......
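The bootstrap hunk above swaps a manually closed `mp.Pool` for a context-managed one, so the pool is always cleaned up even if an exception interrupts the loop. A standalone sketch of that pattern with a toy statistic (the resampling function here is illustrative, not the harness's `_bootstrap_internal`):

```python
# Standalone sketch of the context-managed pool + imap pattern used above.
import multiprocessing as mp
import random


def bootstrap_mean(args):
    seed, xs = args
    rnd = random.Random(seed)
    # one bootstrap resample (with replacement) per work item
    return sum(rnd.choices(xs, k=len(xs))) / len(xs)


if __name__ == "__main__":
    xs = [1.0, 2.0, 3.0, 4.0, 5.0]
    with mp.Pool(mp.cpu_count()) as pool:  # pool is closed and joined on exit
        estimates = list(pool.imap(bootstrap_mean, [(i, xs) for i in range(100)]))
    print(len(estimates))
```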
...@@ -154,15 +154,23 @@ def simple_evaluate( ...@@ -154,15 +154,23 @@ def simple_evaluate(
"Either 'limit' or 'samples' must be None, but both are not None." "Either 'limit' or 'samples' must be None, but both are not None."
) )
_NEEDS_CHAT_TEMPLATE = ("inst", "chat")
if ( if (
(isinstance(model_args, str) and "inst" in model_args.lower()) (
isinstance(model_args, str)
and any(kw in model_args.lower() for kw in _NEEDS_CHAT_TEMPLATE)
)
or ( or (
isinstance(model_args, dict) isinstance(model_args, dict)
and any("inst" in str(v).lower() for v in model_args.values()) and any(
any(kw in str(v).lower() for kw in _NEEDS_CHAT_TEMPLATE)
for v in model_args.values()
)
) )
) and not apply_chat_template: ) and not apply_chat_template:
eval_logger.warning( eval_logger.warning(
"Model appears to be an instruct variant but chat template is not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)." "Model appears to be an instruct or chat variant but chat template is not applied. "
"Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
) )
if delete_requests_cache: if delete_requests_cache:
...@@ -752,6 +760,7 @@ def evaluate( ...@@ -752,6 +760,7 @@ def evaluate(
samples = ( samples = (
hash_dict_images(samples) hash_dict_images(samples)
if os.environ.get("LMEVAL_HASHMM", "1") != "0" if os.environ.get("LMEVAL_HASHMM", "1") != "0"
and (hasattr(lm, "MULTIMODAL"))
else samples else samples
) )
results_dict["samples"] = dict(samples) results_dict["samples"] = dict(samples)
......
...@@ -145,7 +145,7 @@ class MultiChoiceRegexFilter(RegexFilter): ...@@ -145,7 +145,7 @@ class MultiChoiceRegexFilter(RegexFilter):
""" """
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. - step 2 : We parse the choice with regex: r':[\s]*([A-?])', where ? varies by number of choices.
group_select: Selects the (group_select)th match from the findall result. group_select: Selects the (group_select)th match from the findall result.
ignore_case: Ignores the case during step 1 matching ignore_case: Ignores the case during step 1 matching
ignore_punctuation: Remove the punctuation during step 1 matching ignore_punctuation: Remove the punctuation during step 1 matching
......
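To make the two fallback steps in the docstring above concrete, here is a small, self-contained illustration; the choices and response string are made up:

```python
# Toy illustration of the fallback matching described in the docstring above.
import re

choices = ["(A) Paris", "(B) London", "(C) Rome"]
response = "I think the correct answer is: B"

# step 1: look for the literal choice text in the response
step1 = [c for c in choices if c.split(") ", 1)[1].lower() in response.lower()]

# step 2: fall back to a pattern like r":[\s]*([A-C])"
# (the upper bound of the character class depends on the number of choices)
step2 = re.findall(r":[\s]*([A-C])", response)

print(step1, step2)  # [] ['B']
```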
...@@ -135,6 +135,7 @@ class TemplateAPI(TemplateLM): ...@@ -135,6 +135,7 @@ class TemplateAPI(TemplateLM):
eos_string: str = None, eos_string: str = None,
# timeout in seconds # timeout in seconds
timeout: int = 300, timeout: int = 300,
header: Optional[Dict[str, str]] = None,
max_images: int = 1, max_images: int = 1,
**kwargs, **kwargs,
) -> None: ) -> None:
...@@ -152,6 +153,7 @@ class TemplateAPI(TemplateLM): ...@@ -152,6 +153,7 @@ class TemplateAPI(TemplateLM):
self.model = model or pretrained self.model = model or pretrained
self.base_url = base_url self.base_url = base_url
self.tokenizer = tokenizer self.tokenizer = tokenizer
self._header = header
if not isinstance(batch_size, int) and "auto" in batch_size: if not isinstance(batch_size, int) and "auto" in batch_size:
eval_logger.warning( eval_logger.warning(
"Automatic batch size is not supported for API models. Defaulting to batch size 1." "Automatic batch size is not supported for API models. Defaulting to batch size 1."
...@@ -296,7 +298,7 @@ class TemplateAPI(TemplateLM): ...@@ -296,7 +298,7 @@ class TemplateAPI(TemplateLM):
@cached_property @cached_property
def header(self) -> dict: def header(self) -> dict:
"""Override this property to return the headers for the API request.""" """Override this property to return the headers for the API request."""
return {"Authorization": f"Bearer {self.api_key}"} return self._header or {"Authorization": f"Bearer {self.api_key}"}
@property @property
def tokenizer_name(self) -> str: def tokenizer_name(self) -> str:
...@@ -447,6 +449,7 @@ class TemplateAPI(TemplateLM): ...@@ -447,6 +449,7 @@ class TemplateAPI(TemplateLM):
async def amodel_call( async def amodel_call(
self, self,
session: ClientSession, session: ClientSession,
sem: asyncio.Semaphore,
messages: Union[List[List[int]], List[str], List[JsonChatStr]], messages: Union[List[List[int]], List[str], List[JsonChatStr]],
*, *,
generate: bool = True, generate: bool = True,
...@@ -465,6 +468,7 @@ class TemplateAPI(TemplateLM): ...@@ -465,6 +468,7 @@ class TemplateAPI(TemplateLM):
**kwargs, **kwargs,
) )
cache_method = "generate_until" if generate else "loglikelihood" cache_method = "generate_until" if generate else "loglikelihood"
acquired = await sem.acquire()
try: try:
async with session.post( async with session.post(
self.base_url, self.base_url,
...@@ -474,7 +478,8 @@ class TemplateAPI(TemplateLM): ...@@ -474,7 +478,8 @@ class TemplateAPI(TemplateLM):
if not response.ok: if not response.ok:
error_text = await response.text() error_text = await response.text()
eval_logger.warning( eval_logger.warning(
f"API request failed with error message: {error_text}. Retrying..." f"API request failed! Status code: {response.status}, "
f"Response text: {error_text}. Retrying..."
) )
# raising exception will retry the request # raising exception will retry the request
response.raise_for_status() response.raise_for_status()
...@@ -495,11 +500,12 @@ class TemplateAPI(TemplateLM): ...@@ -495,11 +500,12 @@ class TemplateAPI(TemplateLM):
self.cache_hook.add_partial(cache_method, cache, res) self.cache_hook.add_partial(cache_method, cache, res)
return answers return answers
# If the retries also fail # If the retries also fail
except RetryError: except BaseException as e:
eval_logger.error( eval_logger.error(f"Exception:{repr(e)}, {outputs}, retrying.")
"API request failed after multiple retries. Please check the API status." raise e
) finally:
return None if acquired:
sem.release()
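The new semaphore argument caps how many requests are in flight at once, independent of the connector limit. A standalone sketch of the same acquire/release-in-`finally` pattern with `aiohttp` (the URL and payload are placeholders):

```python
# Standalone sketch of capping concurrent POSTs with an asyncio.Semaphore,
# mirroring the acquire/release-in-finally pattern above. URL is a placeholder.
import asyncio

import aiohttp


async def bounded_post(session: aiohttp.ClientSession, sem: asyncio.Semaphore, payload: dict):
    await sem.acquire()
    try:
        async with session.post("https://example.com/v1/completions", json=payload) as resp:
            resp.raise_for_status()
            return await resp.json()
    finally:
        sem.release()


async def main():
    sem = asyncio.Semaphore(4)  # at most 4 requests in flight
    async with aiohttp.ClientSession() as session:
        tasks = [bounded_post(session, sem, {"prompt": f"q{i}"}) for i in range(16)]
        return await asyncio.gather(*tasks, return_exceptions=True)


# asyncio.run(main())
```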
def batch_loglikelihood_requests( def batch_loglikelihood_requests(
self, chunks: Iterable[List[LogLikelihoodInputs]] self, chunks: Iterable[List[LogLikelihoodInputs]]
...@@ -535,6 +541,7 @@ class TemplateAPI(TemplateLM): ...@@ -535,6 +541,7 @@ class TemplateAPI(TemplateLM):
) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]: ) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
ctxlens = ctxlens if ctxlens else [None] * len(requests) ctxlens = ctxlens if ctxlens else [None] * len(requests)
conn = TCPConnector(limit=self._concurrent, ssl=self.verify_certificate) conn = TCPConnector(limit=self._concurrent, ssl=self.verify_certificate)
sem = asyncio.Semaphore(self._concurrent)
async with ClientSession( async with ClientSession(
connector=conn, timeout=ClientTimeout(total=self.timeout) connector=conn, timeout=ClientTimeout(total=self.timeout)
) as session: ) as session:
...@@ -542,12 +549,16 @@ class TemplateAPI(TemplateLM): ...@@ -542,12 +549,16 @@ class TemplateAPI(TemplateLM):
stop=stop_after_attempt(self.max_retries), stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10), wait=wait_exponential(multiplier=0.5, min=1, max=10),
reraise=True, reraise=True,
before_sleep=lambda retry_state: eval_logger.info(
f"Retry attempt {retry_state.attempt_number}"
),
)(self.amodel_call) )(self.amodel_call)
# Create tasks for each batch of request # Create tasks for each batch of request
tasks = [ tasks = [
asyncio.create_task( asyncio.create_task(
retry_( retry_(
session=session, session=session,
sem=sem,
messages=message, messages=message,
cache_keys=cache_key, cache_keys=cache_key,
generate=generate, generate=generate,
......
...@@ -34,6 +34,7 @@ from lm_eval.models.utils import ( ...@@ -34,6 +34,7 @@ from lm_eval.models.utils import (
get_dtype, get_dtype,
handle_stop_sequences, handle_stop_sequences,
pad_and_concat, pad_and_concat,
postprocess_generated_text,
stop_sequences_criteria, stop_sequences_criteria,
) )
...@@ -76,6 +77,7 @@ class HFLM(TemplateLM): ...@@ -76,6 +77,7 @@ class HFLM(TemplateLM):
device: Optional[str] = "cuda", device: Optional[str] = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto", dtype: Optional[Union[str, torch.dtype]] = "auto",
softmax_dtype: Optional[Union[str, torch.dtype]] = None, softmax_dtype: Optional[Union[str, torch.dtype]] = None,
mixed_precision_dtype: Optional[Union[str, torch.dtype]] = None,
batch_size: Optional[Union[int, str]] = 1, batch_size: Optional[Union[int, str]] = 1,
max_batch_size: Optional[int] = 64, max_batch_size: Optional[int] = 64,
trust_remote_code: Optional[bool] = False, trust_remote_code: Optional[bool] = False,
...@@ -94,6 +96,9 @@ class HFLM(TemplateLM): ...@@ -94,6 +96,9 @@ class HFLM(TemplateLM):
autogptq: Optional[Union[bool, str]] = False, autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False, gptqmodel: Optional[bool] = False,
gguf_file: Optional[str] = None, gguf_file: Optional[str] = None,
# end token for thinking, either the string or int token id.
# splits to get response after this token (if provided).
think_end_token: Union[str, int, None] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__() super().__init__()
...@@ -223,6 +228,11 @@ class HFLM(TemplateLM): ...@@ -223,6 +228,11 @@ class HFLM(TemplateLM):
self.model.eval() self.model.eval()
self.model.tie_weights() self.model.tie_weights()
self.think_end_token = (
int(think_end_token)
if (isinstance(think_end_token, str) and think_end_token.isdigit())
else think_end_token
)
self.truncation = truncation self.truncation = truncation
self.logits_cache = logits_cache self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size self.vocab_size = self.tokenizer.vocab_size
...@@ -247,6 +257,11 @@ class HFLM(TemplateLM): ...@@ -247,6 +257,11 @@ class HFLM(TemplateLM):
self.softmax_dtype = ( self.softmax_dtype = (
get_dtype(softmax_dtype) if softmax_dtype is not None else None get_dtype(softmax_dtype) if softmax_dtype is not None else None
) )
self.mixed_precision_dtype = (
get_dtype(mixed_precision_dtype)
if mixed_precision_dtype is not None
else None
)
if str(batch_size).startswith("auto"): if str(batch_size).startswith("auto"):
batch_size = batch_size.split(":") batch_size = batch_size.split(":")
...@@ -903,18 +918,23 @@ class HFLM(TemplateLM): ...@@ -903,18 +918,23 @@ class HFLM(TemplateLM):
logits returned from the model's decoder logits returned from the model's decoder
""" """
with torch.no_grad(): with torch.no_grad():
if attn_mask is not None or labels is not None: with torch.autocast(
assert attn_mask is not None and labels is not None device_type=self.device.type,
assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM dtype=self.mixed_precision_dtype,
return self.model( enabled=self.mixed_precision_dtype is not None,
input_ids=inps, attention_mask=attn_mask, labels=labels ):
).logits if attn_mask is not None or labels is not None:
else: assert attn_mask is not None and labels is not None
assert self.AUTO_MODEL_CLASS in ( assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
transformers.AutoModelForCausalLM, return self.model(
transformers.AutoModelForVision2Seq, input_ids=inps, attention_mask=attn_mask, labels=labels
) ).logits
return self.model(inps).logits else:
assert self.AUTO_MODEL_CLASS in (
transformers.AutoModelForCausalLM,
transformers.AutoModelForVision2Seq,
)
return self.model(inps).logits
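The `torch.autocast` wrapper above (also applied to `_model_generate` below) only takes effect when `mixed_precision_dtype` is set, so the default path is unchanged. A minimal standalone sketch of that pattern, with an illustrative toy model and dtype:

```python
# Standalone sketch of the mixed-precision forward pass wrapped above.
# The tiny model and dtype choice are illustrative only.
import torch

model = torch.nn.Linear(16, 4).to("cpu")
inputs = torch.randn(2, 16)
mixed_precision_dtype = torch.bfloat16  # e.g. parsed from `mixed_precision_dtype=bfloat16`

with torch.no_grad():
    with torch.autocast(
        device_type="cpu",
        dtype=mixed_precision_dtype,
        enabled=mixed_precision_dtype is not None,
    ):
        logits = model(inputs)

print(logits.dtype)  # bfloat16 when autocast is enabled, float32 otherwise
```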
def _model_generate(self, context, max_length, stop, **generation_kwargs): def _model_generate(self, context, max_length, stop, **generation_kwargs):
# temperature = 0.0 if not set # temperature = 0.0 if not set
...@@ -934,14 +954,19 @@ class HFLM(TemplateLM): ...@@ -934,14 +954,19 @@ class HFLM(TemplateLM):
stopping_criteria = stop_sequences_criteria( stopping_criteria = stop_sequences_criteria(
self.tokenizer, stop, context.shape[1], context.shape[0] self.tokenizer, stop, context.shape[1], context.shape[0]
) )
return self.model.generate( with torch.autocast(
input_ids=context, device_type=self.device.type,
max_length=max_length, dtype=self.mixed_precision_dtype,
stopping_criteria=stopping_criteria, enabled=self.mixed_precision_dtype is not None,
pad_token_id=self.tokenizer.pad_token_id, ):
use_cache=True, return self.model.generate(
**generation_kwargs, input_ids=context,
) max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.tokenizer.pad_token_id,
use_cache=True,
**generation_kwargs,
)
def _select_cont_toks( def _select_cont_toks(
self, logits: torch.Tensor, contlen: int = None, inplen: int = None self, logits: torch.Tensor, contlen: int = None, inplen: int = None
...@@ -1411,15 +1436,30 @@ class HFLM(TemplateLM): ...@@ -1411,15 +1436,30 @@ class HFLM(TemplateLM):
if self.backend == "causal": if self.backend == "causal":
cont_toks = cont_toks[context_enc.shape[1] :] cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks) # Handle integer think_end_token: find last occurrence and strip tokens after it
if isinstance(self.think_end_token, int):
think_token_indices = [
i
for i, token in enumerate(cont_toks)
if token == self.think_end_token
]
if think_token_indices:
cont_toks = cont_toks[think_token_indices[-1] + 1 :]
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc s = self.tok_decode(cont_toks)
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
# Strip leading whitespace if we removed thinking tokens
if isinstance(self.think_end_token, int):
s = s.lstrip()
# Apply post-processing: remove stop sequences and string-based thinking tokens
s = postprocess_generated_text(
generation=s,
stop=until,
think_end_token=self.think_end_token
if isinstance(self.think_end_token, str)
else None,
)
res.append(s) res.append(s)
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s) self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
......
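To make the integer `think_end_token` handling above concrete: everything up to and including the last occurrence of the marker token is dropped before decoding. A toy walk-through with made-up token ids:

```python
# Toy walk-through of stripping an integer think_end_token from generated ids.
# The token ids below are made up for illustration.
think_end_token = 151668
cont_toks = [101, 9, 8, 151668, 42, 7, 151668, 13, 5]

think_token_indices = [i for i, tok in enumerate(cont_toks) if tok == think_end_token]
if think_token_indices:
    cont_toks = cont_toks[think_token_indices[-1] + 1:]

print(cont_toks)  # [13, 5] -- only tokens after the *last* end-of-thinking marker remain
```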
...@@ -16,8 +16,8 @@ eval_logger = logging.getLogger(__name__) ...@@ -16,8 +16,8 @@ eval_logger = logging.getLogger(__name__)
class LocalCompletionsAPI(TemplateAPI): class LocalCompletionsAPI(TemplateAPI):
def __init__( def __init__(
self, self,
base_url=None, base_url: str = None,
tokenizer_backend="huggingface", tokenizer_backend: str = "huggingface",
**kwargs, **kwargs,
): ):
super().__init__( super().__init__(
...@@ -108,9 +108,9 @@ class LocalCompletionsAPI(TemplateAPI): ...@@ -108,9 +108,9 @@ class LocalCompletionsAPI(TemplateAPI):
class LocalChatCompletion(LocalCompletionsAPI): class LocalChatCompletion(LocalCompletionsAPI):
def __init__( def __init__(
self, self,
base_url=None, base_url: str = None,
tokenizer_backend=None, tokenizer_backend: str = None,
tokenized_requests=False, tokenized_requests: bool = False,
**kwargs, **kwargs,
): ):
eval_logger.warning( eval_logger.warning(
...@@ -236,6 +236,7 @@ class OpenAIChatCompletion(LocalChatCompletion): ...@@ -236,6 +236,7 @@ class OpenAIChatCompletion(LocalChatCompletion):
eval_logger.warning( eval_logger.warning(
"o1 models do not support `stop` and only support temperature=1" "o1 models do not support `stop` and only support temperature=1"
) )
super().__init__( super().__init__(
base_url=base_url, base_url=base_url,
tokenizer_backend=tokenizer_backend, tokenizer_backend=tokenizer_backend,
......
...@@ -11,6 +11,7 @@ from lm_eval.api.registry import register_model ...@@ -11,6 +11,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.utils import ( from lm_eval.models.utils import (
Collator, Collator,
handle_stop_sequences, handle_stop_sequences,
postprocess_generated_text,
) )
from lm_eval.utils import ( from lm_eval.utils import (
get_rolling_token_windows, get_rolling_token_windows,
...@@ -59,6 +60,8 @@ class SGLangLM(TemplateLM): ...@@ -59,6 +60,8 @@ class SGLangLM(TemplateLM):
dp_size: int = 1, dp_size: int = 1,
tp_size: int = 1, tp_size: int = 1,
prefix_token_id: Optional[int] = None, prefix_token_id: Optional[int] = None,
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token: Optional[str] = None,
**kwargs, **kwargs,
): ):
super().__init__() super().__init__()
...@@ -74,6 +77,7 @@ class SGLangLM(TemplateLM): ...@@ -74,6 +77,7 @@ class SGLangLM(TemplateLM):
"Either context_length or max_model_len may be provided, but not both" "Either context_length or max_model_len may be provided, but not both"
) )
# Initialize your sglang model here # Initialize your sglang model here
self.think_end_token = think_end_token
self._max_length = ( self._max_length = (
max_model_len if max_model_len is not None else context_length max_model_len if max_model_len is not None else context_length
) )
...@@ -263,6 +267,9 @@ class SGLangLM(TemplateLM): ...@@ -263,6 +267,9 @@ class SGLangLM(TemplateLM):
# cache generations # cache generations
for output, context in zip(cont, context): for output, context in zip(cont, context):
generated_text = output.get("text", "") generated_text = output.get("text", "")
generated_text = postprocess_generated_text(
generated_text, until, self.think_end_token
)
res.append(generated_text) res.append(generated_text)
self.cache_hook.add_partial( self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), generated_text "generate_until", (context, gen_kwargs), generated_text
......
...@@ -852,3 +852,32 @@ def truncate_tokens( ...@@ -852,3 +852,32 @@ def truncate_tokens(
right_length = max_length - left_length right_length = max_length - left_length
return tokens[:left_length] + tokens[-right_length:] return tokens[:left_length] + tokens[-right_length:]
return None return None
def postprocess_generated_text(
generation: str, stop: Union[list[str], str, None], think_end_token: Optional[str]
) -> str:
"""
Post-processes the generated text by stripping stop sequences and optional thinking markers.
Args:
generation (str): The generated text to be processed.
stop (Union[list[str], str, None]): Stop sequence(s) to remove. Text is truncated
at the first occurrence of any stop sequence.
think_end_token (Optional[str]): Token marking end of thinking section. If provided,
returns only the text after this token (discarding thinking content).
Returns:
str: The processed generation - text before stop sequences and after thinking sections.
"""
if stop:
stop = [stop] if isinstance(stop, str) else stop
for term in stop:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
generation = generation.split(term)[0]
if think_end_token:
generation = generation.split(think_end_token)[-1].lstrip()
return generation
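A quick usage example for the helper added above (the generation string and markers are illustrative):

```python
# Illustrative usage of the new helper; the strings below are made up.
from lm_eval.models.utils import postprocess_generated_text

raw = "<think>Let me reason this out...</think> The answer is 42.\nQ: next question"

cleaned = postprocess_generated_text(
    generation=raw,
    stop=["\nQ:"],                # truncate at the first stop sequence
    think_end_token="</think>",   # keep only the text after the thinking section
)
print(cleaned)  # -> "The answer is 42."
```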
...@@ -22,6 +22,7 @@ from lm_eval.models.utils import ( ...@@ -22,6 +22,7 @@ from lm_eval.models.utils import (
Collator, Collator,
configure_pad_token, configure_pad_token,
handle_stop_sequences, handle_stop_sequences,
postprocess_generated_text,
undistribute, undistribute,
) )
from lm_eval.utils import ( from lm_eval.utils import (
...@@ -133,7 +134,11 @@ class VLLM(TemplateLM): ...@@ -133,7 +134,11 @@ class VLLM(TemplateLM):
device: str = "cuda", device: str = "cuda",
data_parallel_size: int = 1, data_parallel_size: int = 1,
lora_local_path: str = None, lora_local_path: str = None,
enable_thinking: bool = False, # VLLM: enable thinking tags in the prompt.
enable_thinking: bool = True,
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token: Optional[str] = None,
max_lora_rank: int = 16,
**kwargs, **kwargs,
): ):
super().__init__() super().__init__()
...@@ -147,6 +152,7 @@ class VLLM(TemplateLM): ...@@ -147,6 +152,7 @@ class VLLM(TemplateLM):
assert max_length is None or max_model_len is None, ( assert max_length is None or max_model_len is None, (
"Either max_length or max_model_len may be provided, but not both" "Either max_length or max_model_len may be provided, but not both"
) )
self.think_end_token = think_end_token
self.V1 = os.environ.get("VLLM_USE_V1", "1") != "0" self.V1 = os.environ.get("VLLM_USE_V1", "1") != "0"
self._max_length = max_model_len if max_model_len is not None else max_length self._max_length = max_model_len if max_model_len is not None else max_length
self.tensor_parallel_size = int(tensor_parallel_size) self.tensor_parallel_size = int(tensor_parallel_size)
...@@ -167,6 +173,8 @@ class VLLM(TemplateLM): ...@@ -167,6 +173,8 @@ class VLLM(TemplateLM):
"quantization": quantization, "quantization": quantization,
"seed": int(seed), "seed": int(seed),
"device": str(device), "device": str(device),
"enable_lora": True if lora_local_path else False,
"max_lora_rank": int(max_lora_rank),
} }
self.model_args.update(kwargs) self.model_args.update(kwargs)
self.batch_size = ( self.batch_size = (
...@@ -627,11 +635,11 @@ class VLLM(TemplateLM): ...@@ -627,11 +635,11 @@ class VLLM(TemplateLM):
# cache generations # cache generations
for output, context in zip(cont, context): for output, context in zip(cont, context):
generated_text = output.outputs[0].text generated_text: str = output.outputs[0].text
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until: generated_text = postprocess_generated_text(
if len(term) > 0: generated_text, until, self.think_end_token
generated_text = generated_text.split(term)[0] )
res.append(generated_text) res.append(generated_text)
self.cache_hook.add_partial( self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), generated_text "generate_until", (context, gen_kwargs), generated_text
......
...@@ -45,7 +45,9 @@ ...@@ -45,7 +45,9 @@
| [darija_bench](darija_bench/README.md) | Traditional NLP tasks (Translation, Summarization, etc.) for Moroccan Darija | Moroccan Darija (some MT) | | [darija_bench](darija_bench/README.md) | Traditional NLP tasks (Translation, Summarization, etc.) for Moroccan Darija | Moroccan Darija (some MT) |
| [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) | | [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) |
| [darijammlu](darijammlu/README.md)| Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) | | [darijammlu](darijammlu/README.md)| Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English | | [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [egyhellaswag](egyhellaswag/README.md) | Egyptian Arabic (Masri) version of HellaSwag. | Egyptian Arabic (MT) |
| [egymmlu](egymmlu/README.md) | Multiple-choice QA in Egyptian Arabic. | Egyptian Arabic (MT) |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English | | [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque | | [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque | | [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
......
...@@ -53,4 +53,7 @@ None. ...@@ -53,4 +53,7 @@ None.
- [ ] Majority voting "without CoT" - [ ] Majority voting "without CoT"
### Changelog ### Changelog
no version change: changed dataset to `SaylorTwift/bbh`. Do not expect any change in the results. - no version change: changed dataset to `SaylorTwift/bbh`. Do not expect any change in the results.
- `bbh_cot_fewshot` v.4.0; 2025-07-14:
- PR #3140. Removed duplicate "Let's think step by step" from the fewshots.
- set target_delimiter to "" as the fewshot samples end with a newline character.
...@@ -2,6 +2,7 @@ dataset_path: SaylorTwift/bbh ...@@ -2,6 +2,7 @@ dataset_path: SaylorTwift/bbh
output_type: generate_until output_type: generate_until
test_split: test test_split: test
doc_to_target: "{{target}}" doc_to_target: "{{target}}"
target_delimiter: ""
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -24,4 +25,4 @@ filter_list: ...@@ -24,4 +25,4 @@ filter_list:
- function: "take_first" - function: "take_first"
num_fewshot: 3 num_fewshot: 3
metadata: metadata:
version: 3.0 version: 4.0
...@@ -26,9 +26,7 @@ fewshot_config: ...@@ -26,9 +26,7 @@ fewshot_config:
- Yes - Yes
- No' - No'
target: 'Let''s think step by step. target: 'Here in this question, we are told that "Frank T. had no experience with guns,
Here in this question, we are told that "Frank T. had no experience with guns,
his hand slipped on the barrel of the gun, and the shot went wild." A typical his hand slipped on the barrel of the gun, and the shot went wild." A typical
person would assume that this passage suggests that Frank T. had no intention person would assume that this passage suggests that Frank T. had no intention
of shooting and injuring someone and that the bullet accidentally hit the neighbor''s of shooting and injuring someone and that the bullet accidentally hit the neighbor''s
...@@ -50,9 +48,7 @@ fewshot_config: ...@@ -50,9 +48,7 @@ fewshot_config:
- Yes - Yes
- No' - No'
target: 'Let''s think step by step. target: 'Here in this question, we are told that the boss ordered them both to arrive
Here in this question, we are told that the boss ordered them both to arrive
at the meeting room at the same time and that the motion detector was set up at the meeting room at the same time and that the motion detector was set up
to be triggered if at least one person appeared in the room at the same time." to be triggered if at least one person appeared in the room at the same time."
A typical person would assume that the person probably meant to say the detector A typical person would assume that the person probably meant to say the detector
...@@ -82,9 +78,7 @@ fewshot_config: ...@@ -82,9 +78,7 @@ fewshot_config:
- Yes - Yes
- No' - No'
target: 'Let''s think step by step. target: 'Here in this question, we are told that "He aims the dart at the low point region."
Here in this question, we are told that "He aims the dart at the low point region."
A typical person might therefore think George did intentionally hit the low A typical person might therefore think George did intentionally hit the low
point region, because he wanted to lift up the spirit of his sister Lena. So point region, because he wanted to lift up the spirit of his sister Lena. So
the answer is Yes.' the answer is Yes.'
......
...@@ -26,9 +26,7 @@ fewshot_config: ...@@ -26,9 +26,7 @@ fewshot_config:
(E) 07/14/1938 (E) 07/14/1938
(F) 12/14/1988' (F) 12/14/1988'
target: 'Let''s think step by step. target: 'If today is Christmas Eve of 1937, then today''s date is December 24, 1937.
If today is Christmas Eve of 1937, then today''s date is December 24, 1937.
10 days before today is December 14, 1937, that is 12/14/1937. So the answer 10 days before today is December 14, 1937, that is 12/14/1937. So the answer
is (D).' is (D).'
- input: 'Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY? - input: 'Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?
...@@ -44,9 +42,7 @@ fewshot_config: ...@@ -44,9 +42,7 @@ fewshot_config:
(D) 11/02/2018 (D) 11/02/2018
(E) 11/04/2018' (E) 11/04/2018'
target: 'Let''s think step by step. target: 'If tomorrow is 11/12/2019, then today is 11/11/2019. The date one year ago from
If tomorrow is 11/12/2019, then today is 11/11/2019. The date one year ago from
today is 11/11/2018. So the answer is (B).' today is 11/11/2018. So the answer is (B).'
- input: 'Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. - input: 'Jane and John married on Jan 2, 1958. It is their 5-year anniversary today.
What is the date tomorrow in MM/DD/YYYY? What is the date tomorrow in MM/DD/YYYY?
...@@ -64,9 +60,7 @@ fewshot_config: ...@@ -64,9 +60,7 @@ fewshot_config:
(E) 01/03/1982 (E) 01/03/1982
(F) 12/03/1960' (F) 12/03/1960'
target: 'Let''s think step by step. target: 'If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary
If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary
today, then today''s date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, today, then today''s date is Jan 2, 1963. The date tomorrow is Jan 3, 1963,
that is 01/03/1963. So the answer is (B).' that is 01/03/1963. So the answer is (B).'
include: _cot_fewshot_template_yaml include: _cot_fewshot_template_yaml
......
...@@ -23,9 +23,7 @@ fewshot_config: ...@@ -23,9 +23,7 @@ fewshot_config:
(B) The counselor took the day off (B) The counselor took the day off
(C) Ambiguous' (C) Ambiguous'
target: 'Let''s think step by step. target: 'Here we need to determine who the pronoun "they" might be referring to. There
Here we need to determine who the pronoun "they" might be referring to. There
are two possible referents for "they", namely the chief and the counselor. The are two possible referents for "they", namely the chief and the counselor. The
verb "told" might be able to help us determine which one is more likely (if verb "told" might be able to help us determine which one is more likely (if
either). Let X be the chief and Y the counselor. The sentence is then of the either). Let X be the chief and Y the counselor. The sentence is then of the
...@@ -54,9 +52,7 @@ fewshot_config: ...@@ -54,9 +52,7 @@ fewshot_config:
(B) The manager didn''t reply yet (B) The manager didn''t reply yet
(C) Ambiguous' (C) Ambiguous'
target: 'Let''s think step by step. target: 'Here we need to determine who the pronoun "he" might be referring to. There
Here we need to determine who the pronoun "he" might be referring to. There
are two possible referents for "he", namely the manager and the secretary. The are two possible referents for "he", namely the manager and the secretary. The
verbs "sent" and "reply" might be able to help us determine which one is more verbs "sent" and "reply" might be able to help us determine which one is more
likely (if either). Let X be the manager and Y the secretary. The sentence is likely (if either). Let X be the manager and Y the secretary. The sentence is
...@@ -84,9 +80,7 @@ fewshot_config: ...@@ -84,9 +80,7 @@ fewshot_config:
(B) It will be the director''s office (B) It will be the director''s office
(C) Ambiguous' (C) Ambiguous'
target: 'Let''s think step by step. target: 'Here we need to determine who the pronoun "his" might be referring to. There
Here we need to determine who the pronoun "his" might be referring to. There
are two possible referents for "his", namely Bailey''s and the director''s. are two possible referents for "his", namely Bailey''s and the director''s.
The verb phrase "plan to meet" might be able to help us determine which one The verb phrase "plan to meet" might be able to help us determine which one
is more likely (if either). Let X be Bailey and Y the director. The sentence is more likely (if either). Let X be Bailey and Y the director. The sentence
......
...@@ -13,9 +13,7 @@ fewshot_config: ...@@ -13,9 +13,7 @@ fewshot_config:
samples: samples:
- input: 'Complete the rest of the sequence, making sure that the parentheses are - input: 'Complete the rest of the sequence, making sure that the parentheses are
closed properly. Input: [ { [' closed properly. Input: [ { ['
target: 'Let''s think step by step. target: 'We should process each input one by one and keep track of the stack configuration.
We should process each input one by one and keep track of the stack configuration.
0: empty stack 0: empty stack
...@@ -32,9 +30,7 @@ fewshot_config: ...@@ -32,9 +30,7 @@ fewshot_config:
So, we need "]", "}", "]". So the answer is ] } ].' So, we need "]", "}", "]". So the answer is ] } ].'
- input: 'Complete the rest of the sequence, making sure that the parentheses are - input: 'Complete the rest of the sequence, making sure that the parentheses are
closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]' closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]'
target: 'Let''s think step by step. target: 'We should process each input one by one and keep track of the stack configuration.
We should process each input one by one and keep track of the stack configuration.
0: empty stack 0: empty stack
...@@ -76,9 +72,7 @@ fewshot_config: ...@@ -76,9 +72,7 @@ fewshot_config:
- input: 'Complete the rest of the sequence, making sure that the parentheses are - input: 'Complete the rest of the sequence, making sure that the parentheses are
closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < >
] > }' ] > }'
target: 'Let''s think step by step. target: 'We should process each input one by one and keep track of the stack configuration.
We should process each input one by one and keep track of the stack configuration.
0: empty stack 0: empty stack
......
...@@ -25,7 +25,7 @@ fewshot_config: ...@@ -25,7 +25,7 @@ fewshot_config:
- valid - valid
- invalid' - invalid'
target: "Let's think step by step.\n(1) Lesley is a close friend of Fernando:\ target: "(1) Lesley is a close friend of Fernando:\
\ Lesley = friend(Fernando).\n(2) Being a close friend of Fernando or a schoolmate\ \ Lesley = friend(Fernando).\n(2) Being a close friend of Fernando or a schoolmate\
\ of Lowell is sufficient for being a great-grandfather of Leroy: If X = friend(Fernando)\ \ of Lowell is sufficient for being a great-grandfather of Leroy: If X = friend(Fernando)\
\ OR SCHOOLMATE(Lowell), then X = great-grandfather(Leroy).\nHypothesis: Does\ \ OR SCHOOLMATE(Lowell), then X = great-grandfather(Leroy).\nHypothesis: Does\
...@@ -49,7 +49,7 @@ fewshot_config: ...@@ -49,7 +49,7 @@ fewshot_config:
- valid - valid
- invalid' - invalid'
target: "Let's think step by step.\n(1) Whoever is not a great-grandfather of\ target: "(1) Whoever is not a great-grandfather of\
\ Clyde is a stepbrother of Brian: If X = NOT (great-grandfather(Clyde)), then\ \ Clyde is a stepbrother of Brian: If X = NOT (great-grandfather(Clyde)), then\
\ X = stepbrother(Brian).\n(2): Being an ancestor of Dana is sufficient for\ \ X = stepbrother(Brian).\n(2): Being an ancestor of Dana is sufficient for\
\ not being a great-grandfather of Clyde: If X = ancestor(Dana), X = NOT (great-grandfather(Clyde)).\n\ \ not being a great-grandfather of Clyde: If X = ancestor(Dana), X = NOT (great-grandfather(Clyde)).\n\
...@@ -78,7 +78,7 @@ fewshot_config: ...@@ -78,7 +78,7 @@ fewshot_config:
- valid - valid
- invalid' - invalid'
target: "Let's think step by step.\n(1) Every infrequent user of Paul Mitchell\ target: "(1) Every infrequent user of Paul Mitchell\
\ shampoo is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress\ \ shampoo is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress\
\ soap, or both: If X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin)\ \ soap, or both: If X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin)\
\ OR X = loyal-buyer(Caress).\n(2): No regular consumer of Lush soap is a rare\ \ OR X = loyal-buyer(Caress).\n(2): No regular consumer of Lush soap is a rare\
......