[Frontend] OpenAI API server: Add `add_special_tokens` to ChatCompletionRequest (default False) (#5278)

[Frontend] OpenAI API server: Add `add_special_tokens` to ChatCompletionRequest (default False) (#5278)

[Frontend] OpenAI API server: Add `add_special_tokens` to...
[Frontend] OpenAI API server: Add `add_special_tokens` to ChatCompletionRequest (default False) (#5278)
f0a50054 · tomeras91 · GitHub · c65146e7 · f0a50054 · f0a50054
Unverified commit f0a50054, authored Jun 05, 2024 by tomeras91; committed by GitHub on Jun 05, 2024.
3 changed files
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -176,6 +176,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
         "This is a parameter used by chat template in tokenizer config of the "
         "model."),
    )
+    add_special_tokens: Optional[bool] = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to False (as is the "
+            "default)."),
+    )
    include_stop_str_in_output: Optional[bool] = Field(
        default=False,
        description=(

--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -163,7 +163,9 @@ class OpenAIServingChat(OpenAIServing):
        try:
            # Tokenize/detokenize depending on prompt format (string/token list)
            prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
-                request, prompt=prompt, add_special_tokens=False)
+                request,
+                prompt=prompt,
+                add_special_tokens=request.add_special_tokens)
            sampling_params = request.to_sampling_params()
            lora_request = self._maybe_get_lora(request)
            decoding_config = await self.engine.get_decoding_config()

--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -131,7 +131,8 @@ class OpenAIServing:
            prompt_ids: Optional[List[int]] = None,
            truncate_prompt_tokens: Optional[Annotated[int,
                                                       Field(ge=1)]] = None,
-            add_special_tokens: bool = True) -> Tuple[List[int], str]:
+            add_special_tokens: Optional[bool] = True
+    ) -> Tuple[List[int], str]:
        if not (prompt or prompt_ids):
            raise ValueError("Either prompt or prompt_ids should be provided.")
        if (prompt and prompt_ids):
@@ -139,11 +140,12 @@ class OpenAIServing:
                "Only one of prompt or prompt_ids should be provided.")

        if prompt_ids is None:
-            # When using OpenAIServingChat for chat completions, the
-            # special tokens (e.g., BOS) have already been added by the
-            # chat template. Therefore, we do not need to add them again.
-            # Set add_special_tokens to False to avoid adding the BOS tokens
-            # again.
+            # When using OpenAIServingChat for chat completions, for
+            # most models the special tokens (e.g., BOS) have already
+            # been added by the chat template, so they do not need to
+            # be added again. Chat requests therefore pass
+            # add_special_tokens=False by default to avoid duplicating
+            # the BOS token; this parameter itself defaults to True.
            tokenizer_kwargs: Dict[str, Any] = {
                "add_special_tokens": add_special_tokens
            }