[Frontend] OpenAI API server: Do not add bos token by default when encoding (#4688)

0150a106 · bofeng huang · GitHub · 8e7fb5d4 · 0150a106 · 0150a106
Unverified commit 0150a106, authored May 17, 2024 by bofeng huang; committed by GitHub May 16, 2024.
Showing with 22 additions and 12 deletions

vllm/entrypoints/openai/serving_chat.py +1 -1

vllm/entrypoints/openai/serving_engine.py +21 -11

No files found.
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -158,7 +158,7 @@ class OpenAIServingChat(OpenAIServing):
        try:
            # Tokenize/detokenize depending on prompt format (string/token list)
            prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
-                request, prompt=prompt)
+                request, prompt=prompt, add_special_tokens=False)
            sampling_params = request.to_sampling_params()
            lora_request = self._maybe_get_lora(request)
            decoding_config = await self.engine.get_decoding_config()

--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
 import json
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 from pydantic import Field
 from typing_extensions import Annotated
@@ -170,8 +170,9 @@ class OpenAIServing:
                           EmbeddingRequest],
            prompt: Optional[str] = None,
            prompt_ids: Optional[List[int]] = None,
-        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
-    ) -> Tuple[List[int], str]:
+            truncate_prompt_tokens: Optional[Annotated[int,
+                                                       Field(ge=1)]] = None,
+            add_special_tokens: bool = True) -> Tuple[List[int], str]:
        if not (prompt or prompt_ids):
            raise ValueError("Either prompt or prompt_ids should be provided.")
        if (prompt and prompt_ids):
@@ -179,10 +180,19 @@ class OpenAIServing:
                "Only one of prompt or prompt_ids should be provided.")

        if prompt_ids is None:
-            tokenizer_kwargs = {} if truncate_prompt_tokens is None else {
+            # When using OpenAIServingChat for chat completions, the
+            # special tokens (e.g., BOS) have already been added by the
+            # chat template, so they must not be added a second time.
+            # That caller passes add_special_tokens=False; the default
+            # (True) is kept for every other caller.
+            tokenizer_kwargs: Dict[str, Any] = {
+                "add_special_tokens": add_special_tokens
+            }
+            if truncate_prompt_tokens is not None:
+                tokenizer_kwargs.update({
                    "truncation": True,
                    "max_length": truncate_prompt_tokens,
-            }
+                })
            input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
        elif truncate_prompt_tokens is not None:
            input_ids = prompt_ids[-truncate_prompt_tokens:]