Unverified commit cc2d3463 authored by Baber Abbasi, committed by GitHub

Add chat template to `vllm` (#2034)



* add chat template

* refactor token padding

* nit

* nit

* check on failing test

* check transformers version

* remove transformers pin

* add ids to test

* nit

* fixup

* fix bos bug

* nit

* fixup! fix bos bug

* increase tolerance for table test

* don't detokenize vllm logprobs

* Update lm_eval/models/utils.py
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* pre-commit run --all-files

---------
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
parent e922cceb
@@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
    Collator,
    clear_torch_cache,
    configure_pad_token,
    get_dtype,
    pad_and_concat,
    stop_sequences_criteria,
@@ -253,28 +254,7 @@ class HFLM(TemplateLM):
        self.logits_cache = logits_cache
        self.vocab_size = self.tokenizer.vocab_size
        # select (or create) a pad token to use
        if self.tokenizer.pad_token:
            pass
        elif self.tokenizer.unk_token:
            self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
        elif self.tokenizer.eos_token:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        else:
            if getattr(self.config, "model_type", None) == "qwen":
                # Qwen's trust_remote_code tokenizer does not allow adding special tokens
                self.tokenizer.pad_token = "<|endoftext|>"
            elif (
                self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
                or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
            ):
                # The RWKV world tokenizer does not allow adding special tokens or setting the pad token (which is set as 0).
                # The additional tokenizer name check is needed, as there exist rwkv4 models with a neox tokenizer.
                # ---
                # Note that the world tokenizer class name might change in the future for the final huggingface merge:
                # https://github.com/huggingface/transformers/pull/26963
                assert self.tokenizer.pad_token_id == 0
            else:
                self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
        self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)

        # TODO: override this for Gemma
        self.add_bos_token = add_bos_token
@@ -5,6 +5,7 @@ import itertools
import time
from functools import wraps
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
@@ -24,6 +25,11 @@ import transformers

from lm_eval.utils import eval_logger


if TYPE_CHECKING:
    from transformers import PreTrainedTokenizerBase
    from transformers.configuration_utils import PretrainedConfig


def chunks(iter, n: int = 0, fn=None):
    """
    Divides an iterable into chunks of a specified size or based on a given function.
@@ -613,3 +619,48 @@ class Collator:
        if arr:
            yield arr


def configure_pad_token(
    tokenizer: "PreTrainedTokenizerBase",
    model_config: Optional["PretrainedConfig"] = None,
) -> "PreTrainedTokenizerBase":
    """
    Checks whether the (Hugging Face) tokenizer has a padding token and sets one if it is missing.
    Some tokenizers require special handling.

    Args:
        tokenizer: The tokenizer for which the padding token is to be handled.
        model_config: The configuration of the model. Default is None.

    Returns:
        The tokenizer with the padding token configured.

    Raises:
        AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0.
    """
    if tokenizer.pad_token:
        pass
    elif tokenizer.unk_token:
        tokenizer.pad_token_id = tokenizer.unk_token_id
    elif tokenizer.eos_token:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    else:
        # handle special cases
        if model_config and getattr(model_config, "model_type", None) == "qwen":
            # Qwen's trust_remote_code tokenizer does not allow adding special tokens
            tokenizer.pad_token = "<|endoftext|>"
        elif (
            tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
            or tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
        ):
            # The RWKV world tokenizer does not allow adding special tokens or setting the pad token (which is set as 0).
            # The additional tokenizer name check is needed, as there exist rwkv4 models with a neox tokenizer.
            # ---
            # Note that the world tokenizer class name might change in the future for the final huggingface merge:
            # https://github.com/huggingface/transformers/pull/26963
            assert tokenizer.pad_token_id == 0
        else:
            tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
    return tokenizer
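For reference, the new helper centralizes the fallback chain (existing pad token, then unk, then eos, then model-specific special cases). A minimal usage sketch, reusing the `EleutherAI/pythia-14m` checkpoint that also appears in the tests below:

```python
from transformers import AutoConfig, AutoTokenizer

from lm_eval.models.utils import configure_pad_token

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-14m")
config = AutoConfig.from_pretrained("EleutherAI/pythia-14m")

# If the tokenizer lacks a pad token, the helper reuses unk or eos where
# possible and only adds a brand-new "<|pad|>" token as a last resort.
tokenizer = configure_pad_token(tokenizer, model_config=config)
assert tokenizer.pad_token_id is not None
```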
import copy
from importlib.metadata import version
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union

from more_itertools import distribute
from packaging.version import parse as parse_version
@@ -10,7 +10,7 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, undistribute
from lm_eval.models.utils import Collator, configure_pad_token, undistribute
from lm_eval.utils import (
    eval_logger,
    get_rolling_token_windows,
@@ -26,6 +26,8 @@ try:
except ModuleNotFoundError:
    pass

if TYPE_CHECKING:
    pass

eval_logger = eval_logger
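The utils module above uses a `TYPE_CHECKING` guard so that annotation-only imports cost nothing at runtime; this file adds a guard of its own (empty for now). A generic sketch of the pattern, with illustrative names:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers (mypy, pyright), never at
    # runtime, so the transformers import adds no startup cost.
    from transformers import PreTrainedTokenizerBase


def vocab_size(tokenizer: "PreTrainedTokenizerBase") -> int:
    # The quoted annotation defers evaluation, so the name does not need
    # to exist at runtime.
    return tokenizer.vocab_size
```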
@@ -118,6 +120,7 @@ class VLLM(TemplateLM):
                trust_remote_code=trust_remote_code,
                tokenizer_revision=tokenizer_revision,
            )
        self.tokenizer = configure_pad_token(self.tokenizer)
        self.add_bos_token = add_bos_token
        if "gemma" in pretrained.lower():
            self.add_bos_token = True
@@ -176,23 +179,46 @@
    def max_gen_toks(self):
        return self._max_gen_toks

    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
        """
        Applies a chat template to a history of user/model messages.
        """
        return self.tokenizer.apply_chat_template(
            chat_history, tokenize=False, add_generation_prompt=True
        )

    @property
    def chat_template(self) -> str:
        if self.tokenizer.chat_template is not None:
            return self.tokenizer.chat_template
        return self.tokenizer.default_chat_template

    @property
    def tokenizer_name(self) -> str:
        return self.tokenizer.name_or_path.replace("/", "__")

    def tok_encode(
        self,
        string: str,
        left_truncate_len=None,
        add_special_tokens=None,
        truncation=False,
    ):
        """ """
        string: Union[str, List[str]],
        left_truncate_len: Optional[int] = None,
        add_special_tokens: bool = False,
        truncation: bool = False,
    ) -> Union[List[int], List[List[int]]]:
        if not add_special_tokens:
            add_special_tokens = False or self.add_bos_token
        encoding = self.tokenizer.encode(
            string, add_special_tokens=add_special_tokens, truncation=truncation
        )
        encoding: Union[List[List[int]], List[int]] = self.tokenizer(
            string,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            return_attention_mask=False,
        ).input_ids

        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
        if left_truncate_len:
            encoding = encoding[-left_truncate_len:]
            if not isinstance(string, str):
                encoding = [enc[-left_truncate_len:] for enc in encoding]
            else:
                encoding = encoding[-left_truncate_len:]

        return encoding
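Taken together, the chat-template hook and the batched `tok_encode` let the vLLM wrapper format conversational prompts and encode many of them in a single tokenizer call. A sketch of the intended flow, assuming a chat-capable tokenizer (the checkpoint name and messages here are illustrative, not from this diff):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# Render a conversation into one prompt string, ending with the
# generation prompt so the model continues as the assistant.
chat = [{"role": "user", "content": "Name a prime number."}]
prompt = tokenizer.apply_chat_template(
    chat, tokenize=False, add_generation_prompt=True
)

# Batch-encode several prompts at once, mirroring tok_encode above:
# one tokenizer(...) call, then independent left-truncation per sequence.
encodings = tokenizer(
    [prompt, prompt], add_special_tokens=False, return_attention_mask=False
).input_ids
left_truncate_len = 2048
encodings = [enc[-left_truncate_len:] for enc in encodings]
```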
@@ -209,7 +235,7 @@ class VLLM(TemplateLM):
            sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
        else:
            sampling_params = SamplingParams(
                temperature=0, prompt_logprobs=1, max_tokens=1
                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
        if self.data_parallel_size > 1:
            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
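On this scoring path only the numeric prompt logprobs are consumed downstream, so `detokenize=False` skips string reconstruction that nothing reads. A hedged sketch of how such an output can be scored, assuming vLLM's `RequestOutput` layout (`prompt_token_ids` plus one `{token_id: Logprob}` dict per prompt position, `None` in the first slot); the helper name is hypothetical:

```python
from typing import List, Optional


def continuation_logprob(
    prompt_token_ids: List[int],
    prompt_logprobs: List[Optional[dict]],
    ctx_len: int,
) -> float:
    """Sum the logprobs of the continuation tokens (positions >= ctx_len)."""
    total = 0.0
    for pos in range(ctx_len, len(prompt_token_ids)):
        # Each dict maps token id -> a Logprob object; its float .logprob
        # field is populated even with detokenize=False (only the decoded
        # text is skipped).
        total += prompt_logprobs[pos][prompt_token_ids[pos]].logprob
    return total
```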
@@ -290,7 +316,9 @@ class VLLM(TemplateLM):
        # batch tokenize contexts
        context, all_gen_kwargs = zip(*(req.args for req in requests))
        context_encoding = self.tokenizer(context, add_special_tokens=False).input_ids
        context_encoding: List[List[int]] = self.tok_encode(
            context, add_special_tokens=self.add_bos_token
        )
        requests = [
            ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
        ]
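The generation path now routes batch tokenization through `tok_encode`, so BOS insertion follows the single `add_bos_token` flag instead of being hard-coded off. A toy sketch of the repackaging step that follows (all values illustrative):

```python
# Pair each raw context with its token ids and gen kwargs, producing the
# ((context, tokens), kwargs) shape that the downstream Collator consumes.
context = ["Q: 2+2=", "Q: 3+3="]
context_encoding = [[2, 48, 25], [2, 48, 26]]  # toy ids; leading 2 = BOS
all_gen_kwargs = [{"until": ["\n"]}, {"until": ["\n"]}]

requests = [
    ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
]
assert requests[0] == (("Q: 2+2=", [2, 48, 25]), {"until": ["\n"]})
```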
@@ -132,7 +132,7 @@ class _SCROLLSTask(ConfigurableTask):
    def training_docs(self):
        processed_docs = list(map(self._process_doc, self.dataset["train"]))
        # Flatten the list of lists, since _process_doc returns a list of one element.
        processed_docs = [item for sublist in processed_docs for item in sublist]
        processed_dict = {
@@ -143,13 +143,13 @@ class _SCROLLSTask(ConfigurableTask):
    def validation_docs(self):
        processed_docs = list(map(self._process_doc, self.dataset["validation"]))
        # Flatten the list of lists, since _process_doc returns a list of one element.
        processed_docs = [item for sublist in processed_docs for item in sublist]
        processed_dict = {
            key: [d[key] for d in processed_docs] for key in processed_docs[0]
        }
        return Dataset.from_dict(processed_dict)

    def should_decontaminate(self):
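For context, `_process_doc` in the SCROLLS task returns a one-element list per example, so the docs must be flattened before being columnized for `Dataset.from_dict`. A toy illustration with hypothetical docs:

```python
from datasets import Dataset

# Each inner list mimics _process_doc's one-element output.
processed_docs = [[{"id": "a", "text": "foo"}], [{"id": "b", "text": "bar"}]]
flat = [item for sublist in processed_docs for item in sublist]

# Columnize: one list per field, as Dataset.from_dict expects.
processed_dict = {key: [d[key] for d in flat] for key in flat[0]}
dataset = Dataset.from_dict(processed_dict)
assert dataset["id"] == ["a", "b"]
```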
@@ -33,6 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
            10000,
        ),
    ],
    ids=lambda d: f"{d}",
)
def test_evaluator(
    task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
@@ -107,6 +108,7 @@ def test_evaluator(
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
    ],
    ids=lambda d: f"{d}",
)
def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str):
    results = evaluator.simple_evaluate(
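The new `ids=lambda d: f"{d}"` simply stringifies each parameter value; pytest applies the callable once per value and joins the pieces with `-`, which makes list-valued parameters readable in test output. A toy example of the effect:

```python
import pytest


@pytest.mark.parametrize(
    "tasks, limit",
    [(["arc_easy"], 10), (["hellaswag"], 10)],
    # Without ids, list values show up as opaque ids like "tasks0";
    # stringifying yields ids such as "test_ids[['arc_easy']-10]".
    ids=lambda v: f"{v}",
)
def test_ids(tasks, limit):
    assert isinstance(tasks, list) and limit > 0
```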
@@ -144,6 +146,6 @@ def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str):
        try:
            t1_item = float(t1_item)
            t2_item = float(t2_item)
            assert abs(t1_item - t2_item) < 0.1
            assert abs(t1_item - t2_item) < 0.3
        except ValueError:
            assert t1_item == t2_item
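The widened tolerance applies to the table-comparison loop above, which parses cells as floats when possible and falls back to exact string equality. A condensed sketch of that comparison rule (the helper name is hypothetical):

```python
def cells_match(t1_item: str, t2_item: str, tol: float = 0.3) -> bool:
    # Numeric cells must agree within the (now looser) tolerance;
    # non-numeric cells must match exactly.
    try:
        return abs(float(t1_item) - float(t2_item)) < tol
    except ValueError:
        return t1_item == t2_item


assert cells_match("0.50", "0.65")
assert cells_match("acc", "acc")
```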