[V1] Avoid sending text prompt to core engine (#11963)

Signed-off-by: Roger Wang <ywang@roblox.com>

[V1] Avoid sending text prompt to core engine (#11963)
Signed-off-by: Roger Wang <ywang@roblox.com>
b25cfab9 · Roger Wang · GitHub · 4b657d32 · b25cfab9 · b25cfab9
Unverified commit b25cfab9, authored Jan 11, 2025 by Roger Wang; committed by GitHub on Jan 12, 2025
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 2 deletions

vllm/v1/engine/__init__.py vllm/v1/engine/__init__.py +2 -2

vllm/v1/engine/core_client.py vllm/v1/engine/core_client.py +6 -0

No files found.
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -19,8 +19,8 @@ class EngineCoreRequest:
    # due to circular imports and typing we have in data.py
    request_id: str
-    #NOTE(Nick): I don't think we need to pass prompt here since it should
+    # NOTE(ywang96): original text prompt is needed when a request is added to
-    # always be tokenized?
+    # Detokenizer, but set to None when it is added to EngineCoreClient.
    prompt: Optional[str]
    prompt_token_ids: List[int]
    mm_inputs: Optional[List[Optional["MultiModalKwargs"]]]

--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -219,6 +219,9 @@ class SyncMPClient(MPClient):
        self.input_socket.send_multipart(msg, copy=False)
    def add_request(self, request: EngineCoreRequest) -> None:
+        # NOTE: text prompt is not needed in the core engine as it has been
+        # tokenized.
+        request.prompt = None
        self._send_input(EngineCoreRequestType.ADD, request)
    def abort_requests(self, request_ids: List[str]) -> None:
@@ -257,6 +260,9 @@ class AsyncMPClient(MPClient):
        await self.input_socket.send_multipart(msg, copy=False)
    async def add_request_async(self, request: EngineCoreRequest) -> None:
+        # NOTE: text prompt is not needed in the core engine as it has been
+        # tokenized.
+        request.prompt = None
        await self._send_input(EngineCoreRequestType.ADD, request)
    async def abort_requests_async(self, request_ids: List[str]) -> None: