Support add_generation_prompt in embeddings endpoint with chat request (#23931)

Signed-off-by: biba10 <jaksmid@seznam.cz>

Support add_generation_prompt in embeddings endpoint with chat request (#23931)
Signed-off-by: biba10 <jaksmid@seznam.cz>
28f350e1 · Jakub Smid · GitHub · 51383bd4 · 28f350e1 · 28f350e1
Unverified commit 28f350e1, authored Sep 03, 2025 by Jakub Smid; committed by GitHub on Sep 03, 2025
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 9 additions and 3 deletions

vllm/entrypoints/openai/protocol.py +8 -0

vllm/entrypoints/openai/serving_embedding.py +1 -3

No files found.
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1342,6 +1342,14 @@ class EmbeddingChatRequest(OpenAIBaseModel):
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
    # --8<-- [start:chat-embedding-extra-params]
+    add_generation_prompt: bool = Field(
+        default=False,
+        description=
+        ("If true, the generation prompt will be added to the chat template. "
+         "This is a parameter used by chat template in tokenizer config of the "
+         "model."),
+    )
    add_special_tokens: bool = Field(
        default=False,
        description=(

--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -93,9 +93,7 @@ class EmbeddingMixin(OpenAIServing):
                    or ctx.chat_template,
                    chat_template_content_format=ctx.
                    chat_template_content_format,
-                    # In embedding requests, we are not generating tokens,
+                    add_generation_prompt=ctx.request.add_generation_prompt,
-                    # so there is no need to append extra tokens to the input
-                    add_generation_prompt=False,
                    continue_final_message=False,
                    add_special_tokens=ctx.request.add_special_tokens,
                )