Unverified Commit 77c811ea authored by Baber Abbasi, committed by GitHub

Fix chat template; fix leaderboard math (#2475)

* batch commit

* Revert "batch commit"

This reverts commit d859d1ca.

* batch commit

* checkout from main

* cleanup

* Chat template fix (#7)

* cleanup

* linting

* fix tests

* add ifeval install to new_task CI

* Revert "add ifeval install to new_task CI"

This reverts commit 1d19449bb7fbfa05d51e7cd20950475eae533bf1.

* adds leaderboard tasks (#1)

* adds leaderboard tasks

* Delete lm_eval/tasks/leaderboard/leaderboard_chat_template.yaml

* add readme

* Delete lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro_chat_template.yaml

* modify readme

* fix bbh task

* fix bbh salient task

* modify the readme

* Delete lm_eval/tasks/leaderboard/ifeval/README.md

* Delete lm_eval/tasks/leaderboard/math/README.md

* add leaderboard to the tasks directory

* add announcement about new leaderboard tasks

* linting

* Update README.md
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* installs ifeval dependency in new_task github workflow

---------
Co-authored-by: Nathan Habib <nathan.habib@huggingface.com>
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* fix math parser

* fix math parser

* fix version

* add warning about chat template

---------
Co-authored-by: Nathan Habib <nathan.habib@huggingface.co>
Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Co-authored-by: Nathan Habib <nathan.habib@huggingface.com>
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
Co-authored-by: Nathan Habib <nathan.habib19@gmail.com>
parent bd80a6c0
# Chat Template Delimiter Handling Update
## Overview
This change modifies how the target delimiter is handled when applying chat templates during request construction for loglikelihood and multiple-choice tasks. When `apply_chat_template` is set to `True`, the target delimiter is now set to an empty string instead of the configured delimiter.
## Background
By default, the system inserts a target delimiter (typically a single space `" "`) between the context and target text when constructing prompts. The full string is constructed as:
```
doc_to_text(doc) + target_delimiter + doc_to_target(doc)
```
While this worked well for base models where we wanted the model to predict a single whitespace followed by the answer, chat models have their own formatting conventions that handle spacing differently.
## The Change
- When `apply_chat_template=True`, the target delimiter is now empty (`""`) instead of the default whitespace
- This prevents interference between chat template formatting and the default delimiter system
- Particularly important for multiple-choice tasks, where the template itself handles spacing (see the sketch below)
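A minimal sketch of the rule, assuming the names from the diffs below (`build_mc_requests` itself is a hypothetical helper; the harness does this inline in `ConfigurableTask.construct_requests`):

```python
from typing import List, Tuple

def build_mc_requests(
    ctx: str, choices: List[str], target_delimiter: str, apply_chat_template: bool
) -> List[Tuple[str, str]]:
    # Hypothetical helper illustrating the delimiter rule.
    if apply_chat_template:
        target_delimiter = ""  # the chat template already handles spacing
    # Each choice becomes a (context, continuation) pair to be scored.
    return [(ctx, f"{target_delimiter}{choice}") for choice in choices]

# build_mc_requests("Q: ...\nA:", ["blue", "red"], " ", True)
# -> [("Q: ...\nA:", "blue"), ("Q: ...\nA:", "red")]
```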
## Example
```
# Before (with default delimiter " ")
<user>Question: What color is the sky?\nAnswer:<assistant> blue
# After
<user>Question: What color is the sky?\nAnswer:<assistant>blue
```
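The same effect can be reproduced directly with the `transformers` API; the model name here is purely illustrative:

```python
from transformers import AutoTokenizer

# Any chat model works here; zephyr is only an example.
tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Question: What color is the sky?\nAnswer:"}],
    tokenize=False,
    add_generation_prompt=True,
)
# The old behavior scored the continuation " blue" (leading space from
# the default delimiter); the fix scores "blue", flush against the
# generation prompt, which also tokenizes differently.
print(repr(prompt + "blue"))
```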
## Code Changes
The `apply_chat_template` flag is threaded from `build_all_requests` into `construct_requests`, where the delimiter is chosen:

```diff
@@ -449,6 +449,7 @@ class Task(abc.ABC):
                 doc=doc,
                 ctx=fewshot_ctx,
                 metadata=(self.config["task"], doc_id, self.config.repeats),
+                apply_chat_template=apply_chat_template,
             )

             if not isinstance(inst, list):
```
```diff
@@ -1301,6 +1302,8 @@ class ConfigurableTask(Task):
     def construct_requests(
         self, doc: dict, ctx: str, **kwargs
     ) -> Union[List[Instance], Instance]:
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
+
         aux_arguments = None

         if self.OUTPUT_TYPE == "loglikelihood":
```
```diff
@@ -1310,6 +1313,8 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "multiple_choice":
             choices = self.doc_to_choice(doc)
             target_delimiter = self.config.target_delimiter
+            if apply_chat_template:
+                target_delimiter = ""
             if self.multiple_input:
                 # If there are multiple inputs, choices are placed in the ctx
                 cont = self.doc_to_target(doc)
```
In the evaluator, a warning flags the behavior change:

```diff
@@ -400,6 +400,11 @@ def evaluate(
     eval_logger.setLevel(getattr(logging, f"{verbosity}"))

+    if apply_chat_template:
+        eval_logger.warning(
+            "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
+        )
+
     # tracks all Instances/requests a model must generate output on.
     requests = defaultdict(list)
     # stores the amount to pad out reqs per req. type so that
```
On the model side, `HFLM.apply_chat_template` gains a fallback for tokenizers whose templates reject a `system` turn:

```diff
@@ -4,6 +4,7 @@ from datetime import timedelta
 from pathlib import Path
 from typing import Dict, List, Literal, Optional, Tuple, Union

+import jinja2
 import torch
 import torch.nn.functional as F
 import transformers
```
```diff
@@ -1344,9 +1345,20 @@ class HFLM(TemplateLM):
         """
         Method to apply a chat template to a list of chat history between user and model.
         """
-        return self.tokenizer.apply_chat_template(
-            chat_history, tokenize=False, add_generation_prompt=True
-        )
+        try:
+            chat_templated = self.tokenizer.apply_chat_template(
+                chat_history, tokenize=False, add_generation_prompt=True
+            )
+        except jinja2.exceptions.TemplateError:
+            eval_logger.warning(
+                "Failed to apply chat template; removing the system role from the chat history and retrying."
+            )
+            chat_history = [msg for msg in chat_history if msg["role"] != "system"]
+            chat_templated = self.tokenizer.apply_chat_template(
+                chat_history, tokenize=False, add_generation_prompt=True
+            )
+        return chat_templated

     def get_model_info(self) -> dict:
         """
```
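Templates without a `system` branch (Gemma-style templates, for example) raise `jinja2.exceptions.TemplateError` as soon as a system message appears; the fallback strips it and retries. An illustrative reproduction, with the model name only an example of such a template:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/gemma-2b-it")
history = [
    {"role": "system", "content": "Be concise."},
    {"role": "user", "content": "Hi!"},
]
try:
    # Raises jinja2.exceptions.TemplateError: system role not supported.
    tok.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
except Exception:
    history = [m for m in history if m["role"] != "system"]
    print(tok.apply_chat_template(history, tokenize=False, add_generation_prompt=True))
```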
The task version is bumped because the parser changes below alter scores, so results are not comparable across versions:

```diff
@@ -18,7 +18,7 @@ metric_list:
       higher_is_better: true
 num_fewshot: 4
 metadata:
-  version: 1.0
+  version: 2.0
 dataset_kwargs:
   trust_remote_code: true
 fewshot_config:
```
In the math utilities, `INVALID_ANSWER` becomes a module-level constant:

```diff
@@ -17,6 +17,9 @@ please install sympy via pip install lm-eval[math] or pip install -e .[math]",
     )

+INVALID_ANSWER = "[invalidanswer]"
+
+
 # taken from
 # https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
 def doc_to_text(doc: dict) -> str:
```
`process_results` now rejects unparseable answers up front and short-circuits on exact string matches:

```diff
@@ -70,7 +73,10 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
     unnormalized_answer = get_unnormalized_answer(candidates)
     answer = normalize_final_answer(unnormalized_answer)

-    if is_equiv(answer, doc["answer"]):
+    if answer == INVALID_ANSWER:
+        return {"exact_match": 0}
+
+    if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
         retval = 1
     else:
         retval = 0
```
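A standalone sketch of the new scoring path; `is_equiv` is stubbed here (the real one is the timeout-guarded sympy check defined in this file):

```python
INVALID_ANSWER = "[invalidanswer]"

def is_equiv(x1: str, x2: str) -> bool:
    return x1 == x2  # stand-in for the sympy-based check in this file

def score(answer: str, gold: str) -> int:
    # Sketch only; the harness returns {"exact_match": retval}.
    if answer == INVALID_ANSWER:
        return 0  # nothing parseable was extracted from the model output
    if answer.strip() == gold.strip():
        return 1  # cheap exact match; the sympy check never runs
    return int(is_equiv(answer, gold))  # symbolic equivalence fallback
```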
`remove_boxed` returns `INVALID_ANSWER` instead of raising on malformed input:

```diff
@@ -112,17 +118,19 @@ def last_boxed_only_string(string: str) -> Optional[str]:

 def remove_boxed(s: str) -> str:
-    if "\\boxed " in s:
-        left = "\\boxed "
-        assert s[: len(left)] == left
-        return s[len(left) :]
-
-    left = "\\boxed{"
-
-    assert s[: len(left)] == left
-    assert s[-1] == "}"
-
-    return s[len(left) : -1]
+    try:
+        if "\\boxed " in s:
+            left = "\\boxed "
+            assert s[: len(left)] == left
+            return s[len(left) :]
+
+        left = "\\boxed{"
+
+        assert s[: len(left)] == left
+        assert s[-1] == "}"
+
+        return s[len(left) : -1]
+    except AssertionError:
+        return INVALID_ANSWER


 class timeout:
```
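Illustrative calls showing the new failure mode (not from the diff):

```python
remove_boxed("\\boxed{42}")  # -> "42"
remove_boxed("\\boxed 42")   # -> "42"
remove_boxed("fred")         # -> "[invalidanswer]"  (previously: AssertionError)
```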
The sympy equivalence check also gets a tighter timeout:

```diff
@@ -146,7 +154,7 @@ def is_equiv(x1: str, x2: str) -> bool:
     x1 and x2 are normalized latex string
     """
     try:
-        with timeout(seconds=5):
+        with timeout(seconds=1):
             try:
                 parsed_x1 = parse_latex(x1)
                 parsed_x2 = parse_latex(x2)
```
And the old function-local constant is removed in favor of the module-level one, so `process_results` can compare against it:

```diff
@@ -185,7 +193,6 @@

 def get_unnormalized_answer(text: str) -> str:
-    INVALID_ANSWER = "[invalidanswer]"
     end_seq = "I hope it is correct."
     text += end_seq
     match = re.search(
```
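The truncated `re.search` extracts the Minerva-style final answer; a hedged reconstruction of the rest of the function, based on the upstream `minerva_math` utilities linked earlier in the file:

```python
import re

INVALID_ANSWER = "[invalidanswer]"

def get_unnormalized_answer(text: str) -> str:
    # Minerva-format solutions end with "Final Answer: The final answer
    # is X. I hope it is correct."; appending end_seq lets the pattern
    # close even when generation was cut off mid-sentence.
    end_seq = "I hope it is correct."
    text += end_seq
    match = re.search(
        r"Final Answer: The final answer is(.*?). I hope it is correct.",
        text,
    )
    return match.group(1).strip() if match else INVALID_ANSWER
```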