Unverified Commit 77c811ea authored by Baber Abbasi, committed by GitHub

Fix chat template; fix leaderboard math (#2475)

* batch commit

* Revert "batch commit"

This reverts commit d859d1ca.

* batch commit

* checkout from main

* cleanup

* Chat template fix (#7)

* cleanup

* linting

* fix tests

* add ifeval install to new_task CI

* Revert "add ifeval install to new_task CI"

This reverts commit 1d19449bb7fbfa05d51e7cd20950475eae533bf1.

* adds leaderboard tasks (#1)

* adds leaderboard tasks

* Delete lm_eval/tasks/leaderboard/leaderboard_chat_template.yaml

* add readme

* Delete lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro_chat_template.yaml

* modify readme

* fix bbh task

* fix bbh salient task

* modify the readme

* Delete lm_eval/tasks/leaderboard/ifeval/README.md

* Delete lm_eval/tasks/leaderboard/math/README.md

* add leaderboard to the tasks directory

* add announcement about new leaderboard tasks

* linting

* Update README.md
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* installs ifeval dependency in new_task github workflow

---------
Co-authored-by: Nathan Habib <nathan.habib@huggingface.com>
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* fix math parser

* fix math parser

* fix version

* add warning about chat template

---------
Co-authored-by: Nathan Habib <nathan.habib@huggingface.co>
Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Co-authored-by: Nathan Habib <nathan.habib@huggingface.com>
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
Co-authored-by: Nathan Habib <nathan.habib19@gmail.com>
parent bd80a6c0
# Chat Template Delimiter Handling Update
## Overview
This change modifies how the target delimiter is handled when applying chat templates during request construction for loglikelihood and multiple-choice tasks. When `apply_chat_template` is set to `True`, the target delimiter is now set to an empty string instead of the configured delimiter.
## Background
By default, the system inserts a target delimiter (typically a single space `" "`) between the context and target text when constructing prompts. The full string is constructed as:
```
doc_to_text(doc) + target_delimiter + doc_to_target(doc)
```
While this worked well for base models where we wanted the model to predict a single whitespace followed by the answer, chat models have their own formatting conventions that handle spacing differently.
## The Change
- When `apply_chat_template=True`, the target delimiter is now empty (`""`) instead of the default whitespace
- This prevents interference between chat template formatting and the default delimiter system
- Particularly important for multiple-choice tasks, where the template itself handles spacing (see the sketch below)
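A minimal sketch of the rule, assuming the names from the diffs below (`build_mc_requests` itself is a hypothetical helper; the harness does this inline in `ConfigurableTask.construct_requests`):

```python
from typing import List, Tuple

def build_mc_requests(
    ctx: str, choices: List[str], target_delimiter: str, apply_chat_template: bool
) -> List[Tuple[str, str]]:
    # Hypothetical helper illustrating the delimiter rule.
    if apply_chat_template:
        target_delimiter = ""  # the chat template already handles spacing
    # Each choice becomes a (context, continuation) pair to be scored.
    return [(ctx, f"{target_delimiter}{choice}") for choice in choices]

# build_mc_requests("Q: ...\nA:", ["blue", "red"], " ", True)
# -> [("Q: ...\nA:", "blue"), ("Q: ...\nA:", "red")]
```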
## Example
```
# Before (with default delimiter " ")
<user>Question: What color is the sky?\nAnswer:<assistant> blue
# After
<user>Question: What color is the sky?\nAnswer:<assistant>blue
```
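The same effect can be reproduced directly with the `transformers` API; the model name here is purely illustrative:

```python
from transformers import AutoTokenizer

# Any chat model works here; zephyr is only an example.
tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Question: What color is the sky?\nAnswer:"}],
    tokenize=False,
    add_generation_prompt=True,
)
# The old behavior scored the continuation " blue" (leading space from
# the default delimiter); the fix scores "blue", flush against the
# generation prompt, which also tokenizes differently.
print(repr(prompt + "blue"))
```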
## Code Changes
The `apply_chat_template` flag is threaded from `build_all_requests` into `construct_requests`, where the delimiter is chosen:

```diff
@@ -449,6 +449,7 @@ class Task(abc.ABC):
                 doc=doc,
                 ctx=fewshot_ctx,
                 metadata=(self.config["task"], doc_id, self.config.repeats),
+                apply_chat_template=apply_chat_template,
             )

             if not isinstance(inst, list):
```
```diff
@@ -1301,6 +1302,8 @@ class ConfigurableTask(Task):
     def construct_requests(
         self, doc: dict, ctx: str, **kwargs
     ) -> Union[List[Instance], Instance]:
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
+
         aux_arguments = None

         if self.OUTPUT_TYPE == "loglikelihood":
```
```diff
@@ -1310,6 +1313,8 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "multiple_choice":
             choices = self.doc_to_choice(doc)
             target_delimiter = self.config.target_delimiter
+            if apply_chat_template:
+                target_delimiter = ""
             if self.multiple_input:
                 # If there are multiple inputs, choices are placed in the ctx
                 cont = self.doc_to_target(doc)
```
In the evaluator, a warning flags the behavior change:

```diff
@@ -400,6 +400,11 @@ def evaluate(
     eval_logger.setLevel(getattr(logging, f"{verbosity}"))

+    if apply_chat_template:
+        eval_logger.warning(
+            "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
+        )
+
     # tracks all Instances/requests a model must generate output on.
     requests = defaultdict(list)
     # stores the amount to pad out reqs per req. type so that
```
On the model side, `HFLM.apply_chat_template` gains a fallback for tokenizers whose templates reject a `system` turn:

```diff
@@ -4,6 +4,7 @@ from datetime import timedelta
 from pathlib import Path
 from typing import Dict, List, Literal, Optional, Tuple, Union

+import jinja2
 import torch
 import torch.nn.functional as F
 import transformers
```
```diff
@@ -1344,9 +1345,20 @@ class HFLM(TemplateLM):
         """
         Method to apply a chat template to a list of chat history between user and model.
         """
-        return self.tokenizer.apply_chat_template(
-            chat_history, tokenize=False, add_generation_prompt=True
-        )
+        try:
+            chat_templated = self.tokenizer.apply_chat_template(
+                chat_history, tokenize=False, add_generation_prompt=True
+            )
+        except jinja2.exceptions.TemplateError:
+            eval_logger.warning(
+                "Failed to apply chat template; removing the system role from the chat history and retrying."
+            )
+            chat_history = [msg for msg in chat_history if msg["role"] != "system"]
+            chat_templated = self.tokenizer.apply_chat_template(
+                chat_history, tokenize=False, add_generation_prompt=True
+            )
+        return chat_templated

     def get_model_info(self) -> dict:
         """
```
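Templates without a `system` branch (Gemma-style templates, for example) raise `jinja2.exceptions.TemplateError` as soon as a system message appears; the fallback strips it and retries. An illustrative reproduction, with the model name only an example of such a template:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/gemma-2b-it")
history = [
    {"role": "system", "content": "Be concise."},
    {"role": "user", "content": "Hi!"},
]
try:
    # Raises jinja2.exceptions.TemplateError: system role not supported.
    tok.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
except Exception:
    history = [m for m in history if m["role"] != "system"]
    print(tok.apply_chat_template(history, tokenize=False, add_generation_prompt=True))
```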
The task version is bumped because the parser changes below alter scores, so results are not comparable across versions:

```diff
@@ -18,7 +18,7 @@ metric_list:
       higher_is_better: true
 num_fewshot: 4
 metadata:
-  version: 1.0
+  version: 2.0
 dataset_kwargs:
   trust_remote_code: true
 fewshot_config:
```
In the math utilities, `INVALID_ANSWER` becomes a module-level constant:

```diff
@@ -17,6 +17,9 @@ please install sympy via pip install lm-eval[math] or pip install -e .[math]",
     )

+INVALID_ANSWER = "[invalidanswer]"
+
+
 # taken from
 # https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
 def doc_to_text(doc: dict) -> str:
```
`process_results` now rejects unparseable answers up front and short-circuits on exact string matches:

```diff
@@ -70,7 +73,10 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
     unnormalized_answer = get_unnormalized_answer(candidates)
     answer = normalize_final_answer(unnormalized_answer)

-    if is_equiv(answer, doc["answer"]):
+    if answer == INVALID_ANSWER:
+        return {"exact_match": 0}
+
+    if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
         retval = 1
     else:
         retval = 0
```
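A standalone sketch of the new scoring path; `is_equiv` is stubbed here (the real one is the timeout-guarded sympy check defined in this file):

```python
INVALID_ANSWER = "[invalidanswer]"

def is_equiv(x1: str, x2: str) -> bool:
    return x1 == x2  # stand-in for the sympy-based check in this file

def score(answer: str, gold: str) -> int:
    # Sketch only; the harness returns {"exact_match": retval}.
    if answer == INVALID_ANSWER:
        return 0  # nothing parseable was extracted from the model output
    if answer.strip() == gold.strip():
        return 1  # cheap exact match; the sympy check never runs
    return int(is_equiv(answer, gold))  # symbolic equivalence fallback
```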
`remove_boxed` returns `INVALID_ANSWER` instead of raising on malformed input:

```diff
@@ -112,17 +118,19 @@ def last_boxed_only_string(string: str) -> Optional[str]:

 def remove_boxed(s: str) -> str:
-    if "\\boxed " in s:
-        left = "\\boxed "
-        assert s[: len(left)] == left
-        return s[len(left) :]
-
-    left = "\\boxed{"
-
-    assert s[: len(left)] == left
-    assert s[-1] == "}"
-
-    return s[len(left) : -1]
+    try:
+        if "\\boxed " in s:
+            left = "\\boxed "
+            assert s[: len(left)] == left
+            return s[len(left) :]
+
+        left = "\\boxed{"
+
+        assert s[: len(left)] == left
+        assert s[-1] == "}"
+
+        return s[len(left) : -1]
+    except AssertionError:
+        return INVALID_ANSWER


 class timeout:
```
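Illustrative calls showing the new failure mode (not from the diff):

```python
remove_boxed("\\boxed{42}")  # -> "42"
remove_boxed("\\boxed 42")   # -> "42"
remove_boxed("fred")         # -> "[invalidanswer]"  (previously: AssertionError)
```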
The sympy equivalence check also gets a tighter timeout:

```diff
@@ -146,7 +154,7 @@ def is_equiv(x1: str, x2: str) -> bool:
     x1 and x2 are normalized latex string
     """
     try:
-        with timeout(seconds=5):
+        with timeout(seconds=1):
             try:
                 parsed_x1 = parse_latex(x1)
                 parsed_x2 = parse_latex(x2)
```
And the old function-local constant is removed in favor of the module-level one, so `process_results` can compare against it:

```diff
@@ -185,7 +193,6 @@

 def get_unnormalized_answer(text: str) -> str:
-    INVALID_ANSWER = "[invalidanswer]"
     end_seq = "I hope it is correct."
     text += end_seq
     match = re.search(
```
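The truncated `re.search` extracts the Minerva-style final answer; a hedged reconstruction of the rest of the function, based on the upstream `minerva_math` utilities linked earlier in the file:

```python
import re

INVALID_ANSWER = "[invalidanswer]"

def get_unnormalized_answer(text: str) -> str:
    # Minerva-format solutions end with "Final Answer: The final answer
    # is X. I hope it is correct."; appending end_seq lets the pattern
    # close even when generation was cut off mid-sentence.
    end_seq = "I hope it is correct."
    text += end_seq
    match = re.search(
        r"Final Answer: The final answer is(.*?). I hope it is correct.",
        text,
    )
    return match.group(1).strip() if match else INVALID_ANSWER
```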