Unverified Commit 77c811ea authored by Baber Abbasi and committed by GitHub

Fix chat template; fix leaderboard math (#2475)

* batch commit

* Revert "batch commit"

This reverts commit d859d1ca.

* batch commit

* checkout from main

* checkout from main

* checkout from main

* checkout from main

* checkout from main

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* Chat template fix (#7)

* cleanup

* cleanup

* cleanup

* linting

* fix tests

* add ifeval install to new_task CI

* Revert "add ifeval install to new_task CI"

This reverts commit 1d19449bb7fbfa05d51e7cd20950475eae533bf1.

* adds leaderboard tasks (#1)

* adds leaderboard tasks

* Delete lm_eval/tasks/leaderboard/leaderboard_chat_template.yaml

* add readme

* Delete lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro_chat_template.yaml

* modify readme

* fix bbh task

* fix bbh salient task

* modify the readme

* Delete lm_eval/tasks/leaderboard/ifeval/README.md

* Delete lm_eval/tasks/leaderboard/math/README.md

* add leaderboard to the tasks repertory

* add announcement about new leaderboard tasks

* linting

* Update README.md
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* installs ifeval dependency in new_task github workflow

---------
Co-authored-by: Nathan Habib <nathan.habib@huggingface.com>
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* fix math parser

* fix math parser

* fix version

* add warning about chat template

---------
Co-authored-by: Nathan Habib <nathan.habib@huggingface.co>
Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Co-authored-by: Nathan Habib <nathan.habib@huggingface.com>
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
Co-authored-by: Nathan Habib <nathan.habib19@gmail.com>
parent bd80a6c0
# Chat Template Delimiter Handling Update
## Overview
This change modifies how delimiters are handled when applying chat templates during request construction for loglikelihood and multiple-choice tasks. When `apply_chat_template` is set to `True`, the target delimiter is now set to an empty string instead of the configured delimiter.
## Background
By default, the harness inserts a target delimiter (typically a single space, `" "`) between the context and the target text when constructing prompts. The full string is constructed as:
```
doc_to_text(doc) + target_delimiter + doc_to_target(doc)
```
While this works well for base models, where the model is expected to predict a single space followed by the answer, chat models have their own formatting conventions that handle spacing differently.
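For instance, with the default delimiter the constructed prompt looks like this (hypothetical `doc` and converter functions, shown for illustration only):
```python
# Hypothetical doc and converters, for illustration only.
doc = {"question": "What color is the sky?", "answer": "blue"}

def doc_to_text(doc: dict) -> str:
    return f"Question: {doc['question']}\nAnswer:"

def doc_to_target(doc: dict) -> str:
    return doc["answer"]

target_delimiter = " "  # the default described above
prompt = doc_to_text(doc) + target_delimiter + doc_to_target(doc)
print(prompt)  # Question: What color is the sky?\nAnswer: blue
```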
## The Change
- When `apply_chat_template=True`, the target delimiter is now empty ("") instead of the default whitespace
- This prevents interference between chat template formatting and the default delimiter system
- Particularly important for multiple choice tasks where the template itself handles spacing
## Example
```
# Before (with default delimiter " ")
<user>Question: What color is the sky?\nAnswer:<assistant> blue
# After
<user>Question: What color is the sky?\nAnswer:<assistant>blue
```
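The distinction matters for scoring: with most BPE tokenizers, a leading space changes the token ids of the continuation, so the two variants above are scored as different sequences. A quick check (assumes the `transformers` package is installed and uses `gpt2` purely as an example vocabulary):
```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
# " blue" and "blue" map to different token ids in a BPE vocabulary,
# so the loglikelihoods compared across choices would differ.
print(tok.encode(" blue"), tok.encode("blue"))
```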
```diff
@@ -449,6 +449,7 @@ class Task(abc.ABC):
                 doc=doc,
                 ctx=fewshot_ctx,
                 metadata=(self.config["task"], doc_id, self.config.repeats),
+                apply_chat_template=apply_chat_template,
             )

         if not isinstance(inst, list):
```
```diff
@@ -1301,6 +1302,8 @@ class ConfigurableTask(Task):
     def construct_requests(
         self, doc: dict, ctx: str, **kwargs
     ) -> Union[List[Instance], Instance]:
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
+
         aux_arguments = None

         if self.OUTPUT_TYPE == "loglikelihood":
@@ -1310,6 +1313,8 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "multiple_choice":
             choices = self.doc_to_choice(doc)
             target_delimiter = self.config.target_delimiter
+            if apply_chat_template:
+                target_delimiter = ""
             if self.multiple_input:
                 # If there are multiple inputs, choices are placed in the ctx
                 cont = self.doc_to_target(doc)
```
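A minimal sketch of the effect on multiple-choice request arguments (not the harness API; names and values are illustrative):
```python
choices = ["blue", "green"]
ctx = "<user>Question: What color is the sky?\nAnswer:<assistant>"

for apply_chat_template in (False, True):
    delimiter = "" if apply_chat_template else " "
    # Each choice is scored as (context, delimiter + continuation).
    args = [(ctx, delimiter + choice) for choice in choices]
    print(apply_chat_template, args)
```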
```diff
@@ -400,6 +400,11 @@ def evaluate(
     eval_logger.setLevel(getattr(logging, f"{verbosity}"))

+    if apply_chat_template:
+        eval_logger.warning(
+            "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
+        )
+
     # tracks all Instances/requests a model must generate output on.
     requests = defaultdict(list)
     # stores the amount to pad out reqs per req. type so that
```
```diff
@@ -4,6 +4,7 @@ from datetime import timedelta
 from pathlib import Path
 from typing import Dict, List, Literal, Optional, Tuple, Union

+import jinja2
 import torch
 import torch.nn.functional as F
 import transformers
@@ -1344,9 +1345,20 @@ class HFLM(TemplateLM):
         """
         Method to apply a chat template to a list of chat history between user and model.
         """
-        return self.tokenizer.apply_chat_template(
-            chat_history, tokenize=False, add_generation_prompt=True
-        )
+        try:
+            chat_templated = self.tokenizer.apply_chat_template(
+                chat_history, tokenize=False, add_generation_prompt=True
+            )
+        except jinja2.exceptions.TemplateError:
+            eval_logger.warning(
+                "Failed to apply chat template; removing the system role from the chat history."
+            )
+            chat_history = [msg for msg in chat_history if msg["role"] != "system"]
+            chat_templated = self.tokenizer.apply_chat_template(
+                chat_history, tokenize=False, add_generation_prompt=True
+            )
+
+        return chat_templated

     def get_model_info(self) -> dict:
         """
```
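The fallback can be exercised standalone; this is a sketch assuming a tokenizer whose chat template raises `jinja2.exceptions.TemplateError` for unsupported roles (some templates reject a `system` turn):
```python
import jinja2

def apply_template_with_fallback(tokenizer, chat_history: list) -> str:
    """Render a chat history, dropping system turns if the template rejects them."""
    try:
        return tokenizer.apply_chat_template(
            chat_history, tokenize=False, add_generation_prompt=True
        )
    except jinja2.exceptions.TemplateError:
        # The template cannot render a system role; retry without it.
        chat_history = [m for m in chat_history if m["role"] != "system"]
        return tokenizer.apply_chat_template(
            chat_history, tokenize=False, add_generation_prompt=True
        )
```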
```diff
@@ -18,7 +18,7 @@ metric_list:
     higher_is_better: true
 num_fewshot: 4
 metadata:
-  version: 1.0
+  version: 2.0
 dataset_kwargs:
   trust_remote_code: true
 fewshot_config:
```
```diff
@@ -17,6 +17,9 @@
 please install sympy via pip install lm-eval[math] or pip install -e .[math]",
 )

+INVALID_ANSWER = "[invalidanswer]"
+
+
 # taken from
 # https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
 def doc_to_text(doc: dict) -> str:
@@ -70,7 +73,10 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
     unnormalized_answer = get_unnormalized_answer(candidates)
     answer = normalize_final_answer(unnormalized_answer)

-    if is_equiv(answer, doc["answer"]):
+    if answer == INVALID_ANSWER:
+        return {"exact_match": 0}
+
+    if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
         retval = 1
     else:
         retval = 0
@@ -112,17 +118,19 @@ def last_boxed_only_string(string: str) -> Optional[str]:
 def remove_boxed(s: str) -> str:
-    if "\\boxed " in s:
-        left = "\\boxed "
-        assert s[: len(left)] == left
-        return s[len(left) :]
-
-    left = "\\boxed{"
-    assert s[: len(left)] == left
-    assert s[-1] == "}"
-    return s[len(left) : -1]
+    try:
+        if "\\boxed " in s:
+            left = "\\boxed "
+            assert s[: len(left)] == left
+            return s[len(left) :]
+
+        left = "\\boxed{"
+        assert s[: len(left)] == left
+        assert s[-1] == "}"
+        return s[len(left) : -1]
+    except AssertionError:
+        return INVALID_ANSWER


 class timeout:
@@ -146,7 +154,7 @@ def is_equiv(x1: str, x2: str) -> bool:
     x1 and x2 are normalized latex string
     """
     try:
-        with timeout(seconds=5):
+        with timeout(seconds=1):
             try:
                 parsed_x1 = parse_latex(x1)
                 parsed_x2 = parse_latex(x2)
@@ -185,7 +193,6 @@ def is_equiv(x1: str, x2: str) -> bool:
 def get_unnormalized_answer(text: str) -> str:
-    INVALID_ANSWER = "[invalidanswer]"
     end_seq = "I hope it is correct."
     text += end_seq
     match = re.search(
```
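To see what the `try/except` guard buys, here is a standalone copy of the patched `remove_boxed` with made-up inputs:
```python
INVALID_ANSWER = "[invalidanswer]"

def remove_boxed(s: str) -> str:
    try:
        if "\\boxed " in s:
            left = "\\boxed "
            assert s[: len(left)] == left
            return s[len(left):]
        left = "\\boxed{"
        assert s[: len(left)] == left
        assert s[-1] == "}"
        return s[len(left):-1]
    except AssertionError:
        # Malformed model output no longer crashes the parser;
        # it is marked invalid and scored as exact_match = 0 upstream.
        return INVALID_ANSWER

print(remove_boxed("\\boxed{42}"))       # 42
print(remove_boxed("\\boxed 42"))        # 42
print(remove_boxed("the answer is 42"))  # [invalidanswer]
```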